From 7d7bb0473a66d51a1a556630cc6a9f66e861f041 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 17 Jul 2018 10:51:01 -0500 Subject: [PATCH 001/153] Initial commit --- CMakeLists.txt | 100 ++ DEBIAN/postinst | 19 + DEBIAN/prerm | 18 + LICENSE | 21 + README.md | 59 + RPM/rpm_post | 1 + RPM/rpm_postun | 1 + _clang-format | 60 + cmake_modules/env.cmake | 125 ++ cmake_modules/utils.cmake | 96 ++ doc/rocprofiler_spec.md | 592 ++++++++++ inc/rocprofiler.h | 364 ++++++ script/rpl_run.sh | 377 ++++++ script/tblextr.py | 119 ++ script/txt2xml.sh | 94 ++ src/CMakeLists.txt | 37 + src/core/context.h | 546 +++++++++ src/core/hsa_proxy_queue.h | 67 ++ src/core/hsa_queue.h | 80 ++ src/core/intercept_queue.cpp | 40 + src/core/intercept_queue.h | 230 ++++ src/core/metrics.cpp | 28 + src/core/metrics.h | 302 +++++ src/core/profile.h | 271 +++++ src/core/proxy_queue.cpp | 63 + src/core/proxy_queue.h | 77 ++ src/core/queue.h | 42 + src/core/rocprofiler.cpp | 522 ++++++++ src/core/simple_proxy_queue.cpp | 40 + src/core/simple_proxy_queue.h | 262 +++++ src/core/tracker.h | 188 +++ src/core/types.h | 37 + src/util/exception.h | 72 ++ src/util/hsa_rsrc_factory.cpp | 562 +++++++++ src/util/hsa_rsrc_factory.h | 288 +++++ src/util/logger.h | 191 +++ src/xml/expr.h | 446 +++++++ src/xml/xml.h | 457 +++++++ test/CMakeLists.txt | 62 + test/app/test.cpp | 40 + test/ctrl/run_kernel.h | 83 ++ test/ctrl/test_aql.h | 77 ++ test/ctrl/test_hsa.cpp | 283 +++++ test/ctrl/test_hsa.h | 124 ++ test/ctrl/test_kernel.h | 134 +++ test/run.sh | 61 + .../gfx8_SimpleConvolution.hsaco | Bin 0 -> 9392 bytes .../gfx9_SimpleConvolution.hsaco | Bin 0 -> 11136 bytes test/simple_convolution/simple_convolution.cl | 76 ++ .../simple_convolution/simple_convolution.cpp | 388 ++++++ test/simple_convolution/simple_convolution.h | 94 ++ test/tool/gfx_metrics.xml | 69 ++ test/tool/input.xml | 14 + test/tool/metrics.xml | 205 ++++ test/tool/tool.cpp | 1048 +++++++++++++++++ test/util/helper_funcs.h | 86 ++ 
test/util/hsa_rsrc_factory.cpp | 556 +++++++++ test/util/hsa_rsrc_factory.h | 284 +++++ test/util/perf_timer.cpp | 179 +++ test/util/perf_timer.h | 83 ++ test/util/test_assert.h | 46 + test/util/xml.h | 457 +++++++ 62 files changed, 11343 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 DEBIAN/postinst create mode 100644 DEBIAN/prerm create mode 100644 LICENSE create mode 100644 README.md create mode 100644 RPM/rpm_post create mode 100644 RPM/rpm_postun create mode 100644 _clang-format create mode 100644 cmake_modules/env.cmake create mode 100644 cmake_modules/utils.cmake create mode 100644 doc/rocprofiler_spec.md create mode 100644 inc/rocprofiler.h create mode 100755 script/rpl_run.sh create mode 100755 script/tblextr.py create mode 100755 script/txt2xml.sh create mode 100644 src/CMakeLists.txt create mode 100644 src/core/context.h create mode 100644 src/core/hsa_proxy_queue.h create mode 100644 src/core/hsa_queue.h create mode 100644 src/core/intercept_queue.cpp create mode 100644 src/core/intercept_queue.h create mode 100644 src/core/metrics.cpp create mode 100644 src/core/metrics.h create mode 100644 src/core/profile.h create mode 100644 src/core/proxy_queue.cpp create mode 100644 src/core/proxy_queue.h create mode 100644 src/core/queue.h create mode 100644 src/core/rocprofiler.cpp create mode 100644 src/core/simple_proxy_queue.cpp create mode 100644 src/core/simple_proxy_queue.h create mode 100644 src/core/tracker.h create mode 100644 src/core/types.h create mode 100644 src/util/exception.h create mode 100644 src/util/hsa_rsrc_factory.cpp create mode 100644 src/util/hsa_rsrc_factory.h create mode 100644 src/util/logger.h create mode 100644 src/xml/expr.h create mode 100644 src/xml/xml.h create mode 100644 test/CMakeLists.txt create mode 100644 test/app/test.cpp create mode 100644 test/ctrl/run_kernel.h create mode 100644 test/ctrl/test_aql.h create mode 100644 test/ctrl/test_hsa.cpp create mode 100644 test/ctrl/test_hsa.h create mode 100644 
test/ctrl/test_kernel.h create mode 100755 test/run.sh create mode 100644 test/simple_convolution/gfx8_SimpleConvolution.hsaco create mode 100755 test/simple_convolution/gfx9_SimpleConvolution.hsaco create mode 100644 test/simple_convolution/simple_convolution.cl create mode 100644 test/simple_convolution/simple_convolution.cpp create mode 100644 test/simple_convolution/simple_convolution.h create mode 100644 test/tool/gfx_metrics.xml create mode 100644 test/tool/input.xml create mode 100644 test/tool/metrics.xml create mode 100644 test/tool/tool.cpp create mode 100644 test/util/helper_funcs.h create mode 100644 test/util/hsa_rsrc_factory.cpp create mode 100644 test/util/hsa_rsrc_factory.h create mode 100644 test/util/perf_timer.cpp create mode 100644 test/util/perf_timer.h create mode 100644 test/util/test_assert.h create mode 100644 test/util/xml.h diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..6249e098 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,100 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +cmake_minimum_required ( VERSION 3.5.0 ) + +## Verbose output. +set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) + +## Set module name and project name. +set ( ROCPROFILER_NAME "rocprofiler" ) +set ( ROCPROFILER_TARGET "${ROCPROFILER_NAME}64" ) +set ( ROCPROFILER_LIBRARY "lib${ROCPROFILER_TARGET}" ) +project ( ${ROCPROFILER_TARGET} ) + +## Adding default path cmake modules +list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" ) +## Include common cmake modules +include ( utils ) +## Set build environment +include ( env ) + +## Setup the package version. +get_version ( "1.0.0" ) +message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" ) + +set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) +set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) +set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) +if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) + message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" ) + set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) +endif () +set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +## Set target and root/lib/test directory +set ( TARGET_NAME "${ROCPROFILER_TARGET}" ) +set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) +set ( LIB_DIR "${ROOT_DIR}/src" ) +set ( TEST_DIR "${ROOT_DIR}/test" ) + +## Build library +include ( ${LIB_DIR}/CMakeLists.txt ) + +## Set the VERSION and SOVERSION values +set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION 
"${LIB_VERSION_STRING}" ) +set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" ) + +## If the library is a release, strip the target library +if ( "${CMAKE_BUILD_TYPE}" STREQUAL release ) + add_custom_command ( TARGET ${ROCPROFILER_TARGET} POST_BUILD COMMAND ${CMAKE_STRIP} *.so ) +endif () + +## Build tests +add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) + +## Install information +install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${ROCPROFILER_NAME}/lib ) +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${ROCPROFILER_NAME}/include ) + +## Packaging directives +set ( CPACK_PACKAGE_NAME "${ROCPROFILER_NAME}-dev" ) +set ( CPACK_PACKAGE_VENDOR "AMD" ) +set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) +set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} ) +set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} ) +set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." ) +set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "ROCPROFILER library for AMD HSA runtime API extension support" ) +set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) + +## Debian package specific variables +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/HSA-RocProfiler" ) +set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) + +## RPM package specific variables +set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) +set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) + +include ( CPack ) diff --git a/DEBIAN/postinst b/DEBIAN/postinst new file mode 100644 index 00000000..3d022884 --- /dev/null +++ b/DEBIAN/postinst @@ -0,0 +1,19 @@ +#/bin/bash + +set -e + +do_ldconfig() { + echo /opt/rocm/librocprofiler/lib > 
/etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +} + +case "$1" in + configure) + do_ldconfig + ;; + abort-upgrade|abort-remove|abort-deconfigure) + echo "$1" + ;; + *) + exit 0 + ;; +esac diff --git a/DEBIAN/prerm b/DEBIAN/prerm new file mode 100644 index 00000000..b3f509a9 --- /dev/null +++ b/DEBIAN/prerm @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +rm_ldconfig() { + rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +} + +case "$1" in + remove) + rm_ldconfig + ;; + purge) + ;; + *) + exit 0 + ;; +esac diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..fe4ce68b --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ diff --git a/README.md b/README.md new file mode 100644 index 00000000..5492d17d --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +# ROC-profiler + +ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. + +The library source tree: + - doc - Documentation + - inc/rocprofiler.h - Library public API + - src - Library sources + - core - Library API sources + - util - Library utils sources + - xml - XML parser + - test - Library test suite + - ctrl - Test control + - util - Test utils + - simple_convolution - Simple convolution test kernel + +## Build environment: +``` + export CMAKE_PREFIX_PATH=: + export CMAKE_BUILD_TYPE= # release by default + export CMAKE_DEBUG_TRACE=1 # to enable debug tracing +``` + +## To build with the current installed ROCM: +``` + cd .../rocprofiler + mkdir build + cd build + cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. + make +``` + +## To run the test: +``` + cd .../rocprofiler/build + export LD_LIBRARY_PATH=.: # paths to ROC profiler and other libraries + export HSA_TOOLS_LIB=librocprofiler64.so # ROC profiler library loaded by HSA runtime + export ROCP_TOOL_LIB=test/libtool.so # tool library loaded by ROC profiler + export ROCP_METRICS=metrics.xml # ROC profiler metrics config file + export ROCP_INPUT=input.xml # input file for the tool library + export ROCP_OUTPUT_DIR=./ # output directory for the tool library, for metrics results file 'results.txt' + +``` + +## Internal 'simple_convolution' test run script: +``` + cd .../rocprofiler/build + run.sh +``` + +## To enable error messages logging to '/tmp/rocprofiler_log.txt': +``` + export ROCPROFILER_LOG=1 +``` + +## To enable verbose tracing: +``` + export ROCPROFILER_TRACE=1 +``` diff --git a/RPM/rpm_post b/RPM/rpm_post new file mode 100644 index 00000000..57c5c811 --- /dev/null +++ b/RPM/rpm_post @@ -0,0 +1 @@ +echo /opt/rocm/librocprofiler/lib 
> /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig diff --git a/RPM/rpm_postun b/RPM/rpm_postun new file mode 100644 index 00000000..6b3c8f28 --- /dev/null +++ b/RPM/rpm_postun @@ -0,0 +1 @@ +rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig diff --git a/_clang-format b/_clang-format new file mode 100644 index 00000000..0c81671e --- /dev/null +++ b/_clang-format @@ -0,0 +1,60 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +ConstructorInitializerIndentWidth: 4 +AlignEscapedNewlinesLeft: false +AlignTrailingComments: true +AlignConsecutiveAssignments: false +AlignOperands: false +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AlwaysBreakAfterDefinitionReturnType: false +AlwaysBreakTemplateDeclarations: false +AlwaysBreakBeforeMultilineStrings: true +BreakBeforeBinaryOperators: false +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BinPackParameters: true +ColumnLimit: 100 +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ExperimentalAutoDetectBinPacking: false +IndentCaseLabels: true +IndentWrappedFunctionNames: false +IndentFunctionDeclarationAfterType: false +MaxEmptyLinesToKeep: 2 +KeepEmptyLinesAtTheStartOfBlocks: false +NamespaceIndentation: None +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakString: 1000 +PenaltyBreakFirstLessLess: 120 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +DerivePointerAlignment: false +PointerAlignment: Left +SpacesBeforeTrailingComments: 2 +Cpp11BracedListStyle: true +Standard: Auto +IndentWidth: 2 +TabWidth: 8 +UseTab: Never +BreakBeforeBraces: Attach +SpacesInParentheses: false +SpacesInAngles: false +SpaceInEmptyParentheses: false +SpacesInCStyleCastParentheses: false 
+SpacesInContainerLiterals: true +SpaceBeforeAssignmentOperators: true +ContinuationIndentWidth: 4 +CommentPragmas: '^ IWYU pragma:' +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +SpaceBeforeParens: ControlStatements +DisableFormat: false +SortIncludes: false +... diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake new file mode 100644 index 00000000..ca7c4804 --- /dev/null +++ b/cmake_modules/env.cmake @@ -0,0 +1,125 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +## Build is not supported on Windows plaform +if ( WIN32 ) + message ( FATAL_ERROR "Windows build is not supported." ) +endif () + +## Compiler Preprocessor definitions. 
+add_definitions ( -D__linux__ ) +add_definitions ( -DUNIX_OS ) +add_definitions ( -DLINUX ) +add_definitions ( -D__AMD64__ ) +add_definitions ( -D__x86_64__ ) +add_definitions ( -DAMD_INTERNAL_BUILD ) +add_definitions ( -DLITTLEENDIAN_CPU=1 ) +add_definitions ( -DHSA_LARGE_MODEL= ) +add_definitions ( -DHSA_DEPRECATED= ) + +## Linux Compiler options +set ( CMAKE_CXX_FLAGS "-std=c++11") +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) + +set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) + +set ( CMAKE_SKIP_BUILD_RPATH TRUE ) + +## CLANG options +if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000" ) +endif() + +## Enable debug trace +if ( DEFINED ENV{CMAKE_DEBUG_TRACE} ) + add_definitions ( -DDEBUG_TRACE=1 ) +endif() + +## Enable direct loading of AQL-profile HSA extension +if ( DEFINED ENV{CMAKE_LD_AQLPROFILE} ) + add_definitions ( -DROCP_LD_AQLPROFILE=1 ) +endif() + +## Make env vars +if ( NOT DEFINED CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "" ) + if ( DEFINED ENV{CMAKE_BUILD_TYPE} ) + set ( CMAKE_BUILD_TYPE $ENV{CMAKE_BUILD_TYPE} ) + endif() +endif() +if ( NOT DEFINED CMAKE_PREFIX_PATH AND DEFINED ENV{CMAKE_PREFIX_PATH} ) + set ( CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH} ) +endif() + +## Extend Compiler flags based on build type +string ( TOLOWER 
"${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE ) +if ( "${CMAKE_BUILD_TYPE}" STREQUAL debug ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" ) + set ( CMAKE_BUILD_TYPE "debug" ) +else () + set ( CMAKE_BUILD_TYPE "release" ) +endif () + +## Extend Compiler flags based on Processor architecture +if ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" ) + set ( NBIT 64 ) + set ( NBITSTR "64" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" ) +elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" ) + set ( NBIT 32 ) + set ( NBITSTR "" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32" ) +endif () + +## Find hsa-runtime headers/lib +find_file ( HSA_RUNTIME_INC "hsa.h" ) +if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) + find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) +endif() +find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" ) +get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) +get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY ) + +find_library ( HSA_KMT_LIB "libhsakmt.so" ) +get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY ) + +set ( API_PATH ${HSA_RUNTIME_INC_PATH} ) + +## Basic Tool Chain Information +message ( "----------------NBIT: ${NBIT}" ) +message ( "-----------BuildType: ${CMAKE_BUILD_TYPE}" ) +message ( "------------Compiler: ${CMAKE_CXX_COMPILER}" ) +message ( "----Compiler-Version: ${CMAKE_CXX_COMPILER_VERSION}" ) +message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) +message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) +message ( "------------API-path: ${API_PATH}" ) +message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) +message ( "---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) diff --git a/cmake_modules/utils.cmake b/cmake_modules/utils.cmake new file mode 100644 index 00000000..15865820 --- /dev/null +++ b/cmake_modules/utils.cmake @@ -0,0 +1,96 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro 
Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +## Parses the VERSION_STRING variable and places +## the first, second and third number values in +## the major, minor and patch variables. 
+function( parse_version VERSION_STRING ) + + string ( FIND ${VERSION_STRING} "-" STRING_INDEX ) + + if ( ${STRING_INDEX} GREATER -1 ) + math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" ) + string ( SUBSTRING ${VERSION_STRING} ${STRING_INDEX} -1 VERSION_BUILD ) + endif () + + string ( REGEX MATCHALL "[0123456789]+" VERSIONS ${VERSION_STRING} ) + list ( LENGTH VERSIONS VERSION_COUNT ) + + if ( ${VERSION_COUNT} GREATER 0) + list ( GET VERSIONS 0 MAJOR ) + set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE ) + set ( TEMP_VERSION_STRING "${MAJOR}" ) + endif () + + if ( ${VERSION_COUNT} GREATER 1 ) + list ( GET VERSIONS 1 MINOR ) + set ( VERSION_MINOR ${MINOR} PARENT_SCOPE ) + set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" ) + endif () + + if ( ${VERSION_COUNT} GREATER 2 ) + list ( GET VERSIONS 2 PATCH ) + set ( VERSION_PATCH ${PATCH} PARENT_SCOPE ) + set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" ) + endif () + + if ( DEFINED VERSION_BUILD ) + set ( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE ) + endif () + + set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE ) + +endfunction () + +## Gets the current version of the repository +## using versioning tags and git describe. +## Passes back a packaging version string +## and a library version string. 
+function ( get_version DEFAULT_VERSION_STRING ) + + parse_version ( ${DEFAULT_VERSION_STRING} ) + + find_program ( GIT NAMES git ) + + if ( GIT ) + + execute_process ( COMMAND "git describe --dirty --long --match [0-9]* 2>/dev/null" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE GIT_TAG_STRING + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RESULT ) + + if ( ${RESULT} EQUAL 0 ) + + parse_version ( ${GIT_TAG_STRING} ) + + endif () + + endif () + + set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE ) + set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE ) + set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE ) + set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE ) + set( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE ) + +endfunction() diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md new file mode 100644 index 00000000..001bcbe1 --- /dev/null +++ b/doc/rocprofiler_spec.md @@ -0,0 +1,592 @@ +# ROC Profiler Library Specification + +## 1. High level overview +``` +The goal of the implementation is to provide a HW specific low-level performance analysis +interface for profiling of GPU compute applications. The profiling includes HW performance +counters with complex performance metrics and HW traces. The implementation distinguishes +two profiling features, metrics and traces. HW performance counters are treated as the basic +metrics and the formulas can be defined for derived complex metrics. +The library can be loaded by HSA runtime as a tool plugin and it can be loaded by higher +level HW independent performance analysis API like PAPI. +The library has C API and is based on AQLprofile AMD specific HSA extension. + + 1. The library provides methods to query the list of supported HW features. + 2. The library provides profiling APIs to start, stop, read metrics results and tracing + data. + 3. The library provides a callback API for collecting per-kernel profiling data for + the kernels + dispatched to HSA AQL queues. + 4. 
The library provides mechanism to load profiling tool library plugin by env variable + ROCP_TOOL_LIB. + 5. The library is responsible for allocation of the buffers for profiling and notifying + about output data buffer overflow for traces. + 6. The library is implemented based on AMD specific AQLprofile HSA extension. + 7. The library implementation is abstracted from the specific GFXIP. + 8. The library implementation is extensible: + - Easy adding of counters and metrics + - Counters enumeration + - Counters and metrics can be dynamically configured using XML configuration files with + counters and metrics tables: + o Counters table entry, basic metric: counter name, block name, event id + o Complex metrics table entry: metric name, an expression for calculating the metric + from the counters + +Metrics XML file example: + + + + . . . + + + + . . . + + + + + +``` +## 2. Environment +``` +* HSA_TOOLS_LIB - required to be set to the name of rocprofiler library to be loaded by +HSA runtime +* ROCP_METRICS - path to the metrics XML file +* ROCP_TOOL_LIB - path to profiling tool library loaded by ROC Profiler +* ROCP_HSA_INTERCEPT - if set then HSA dispatches intercepting is enabled +``` +## 3. General API +### 3.1. Description +``` +The library supports methods for getting the error number and error string of the last +failed library API call. +To check the conformance of used library API header and the library binary the version +macros and API methods can be used. + +Returning the error and error string methods: +- rocprofiler_errno - method for returning the error number +- rocprofiler_error_string - method for returning the error string + +Library version: +- ROCPROFILER_VERSION_MAJOR - API major version macro +- ROCPROFILER_VERSION_MINOR - API minor version macro +- rocprofiler_version_major - library major version +- rocprofiler_version_minor - library minor version +``` +### 3.2. 
Returning the error and error string methods +``` +rocprofiler_errno_t rocprofiler_errno(); +const char* rocprofiler_error_string(); +``` +### 3.3. Library version +``` +The library provides backward compatibility if the library major version is less than or equal +to the API major version macro. + +API version macros defined in the library API header 'rocprofiler.h': + +ROCPROFILER_VERSION_MAJOR +ROCPROFILER_VERSION_MINOR + +Methods to check library major and minor version: + +uint32_t rocprofiler_version_major(); +uint32_t rocprofiler_version_minor(); +``` +## 4. Backend API +### 4.1. Description +``` +The library provides the methods to open/close profiling context, to start, stop and read +HW performance counters and traces, to intercept kernel dispatches to collect per-kernel +profiling data. Also the library provides methods to calculate complex performance metrics +and to query the list of available metrics. The library distinguishes two profiling features, +metrics and traces, where HW performance counters are treated as the basic metrics. To check +if there was an error the library methods return HSA standard status code. +For a given context the profiling can be started/stopped and counters sampled in standalone +mode or profiling can be initiated by intercepting the kernel dispatches with registering +a dispatch callback. +For counters sampling, which is the usage model of higher level APIs like PAPI, +the start/stop/read APIs should be used. +For collecting per-kernel data for the submitted to HSA queues kernels the dispatch callback +API should be used. +The library provides backward compatibility if the library major version is less than or equal to the API major version macro. 
+ +Returned API status: +- hsa_status_t - HSA status codes are used from hsa.h header + +Info API: +- rocprofiler_info_kind_t - profiling info kind +- rocprofiler_info_query_t - profiling info query +- rocprofiler_info_data_t - profiling info data +- rocprofiler_get_info - return the info for a given info kind +- rocprofiler_iterate_info - iterate over the info for a given info kind +- rocprofiler_query_info - iterate over the info for a given info query + +Context API: +- rocprofiler_t - profiling context handle +- rocprofiler_feature_kind_t - profiling feature kind +- rocprofiler_feature_parameter_t - profiling feature parameter +- rocprofiler_data_kind_t - profiling data kind +- rocprofiler_data_t - profiling data +- rocprofiler_feature_t - profiling feature +- rocprofiler_mode_t - profiling modes +- rocprofiler_properties_t - profiler properties +- rocprofiler_open - open new profiling context +- rocprofiler_close - close profiling context and release all allocated resources +- rocprofiler_group_count - return profiling groups count +- rocprofiler_get_group - return profiling group for a given index +- rocprofiler_get_metrics - method for calculating the metrics data +- rocprofiler_iterate_trace_data - method for iterating output trace data instances + +Sampling API: +- rocprofiler_start - start profiling +- rocprofiler_stop - stop profiling +- rocprofiler_read - read profiling data to the profiling features objects +- rocprofiler_get_data - wait for profiling data + Group versions of start/stop/read/get_data methods: + o rocprofiler_group_start + o rocprofiler_group_stop + o rocprofiler_group_read + o rocprofiler_group_get_data + +Intercepting API: +- rocprofiler_callback_t - profiling callback type +- rocprofiler_callback_data_t - profiling callback data type +- rocprofiler_set_queue_callbacks - set queue kernel dispatch and queue destroy callbacks +- rocprofiler_remove_queue_callbacks - remove queue callbacks +``` +### 4.2. 
Info API +``` +The profiling metrics are defined by name and the traces are defined by name and parameters. +All supported features can be iterated using 'iterate_info/query_info' methods. The counter +names are defined in the counters table configuration file; each counter has a unique name and +is defined by block name and event id. The trace and trace parameter names are the same as in +the hardware documentation and the parameters codes are rocprofiler_feature_parameter_t values, +see below in the "Context API" section. +Profiling info kind: + +typedef enum { + ROCPROFILER_INFO_KIND_METRIC = 0, // metric info + ROCPROFILER_INFO_KIND_METRIC_COUNT = 1, // metrics count + ROCPROFILER_INFO_KIND_TRACE = 2, // trace info + ROCPROFILER_INFO_KIND_TRACE_COUNT = 3, // traces count +} rocprofiler_info_kind_t; + +Profiling info data: + +typedef struct { + rocprofiler_info_kind_t kind; // info data kind + union { + struct { + const char* name; // metric name + const char* description; // metric description + } metric; + struct { + const char* name; // trace name + const char* description; // trace description + uint32_t parameter_count; // number of parameters supported + // by the trace + } trace; + }; +} rocprofiler_info_data_t; + +Return info for a given info kind: + +hsa_status_t rocprofiler_get_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + void *data); // data passed to callback + +Iterate over the info for a given info kind, and invoke an application-defined callback on +every iteration: + +hsa_status_t rocprofiler_iterate_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); + +Iterate over the info for a given info query, and invoke an application-defined callback on +every iteration. 
The query +fields set to NULL define the query wildcard: + +hsa_status_t rocprofiler_query_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + rocprofiler_info_data_t query, // info query + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); // data passed to callback +``` +### 4.3. Context API +``` +Profiling context is accumulating all profiling information including profiling features +which carry profiling data, required buffers for profiling command packets and output data. +The context can be created and deleted by the library open/close methods. By deleting +the context all resources accumulated by the library and associated with this context will be +released. If more than one run is required to collect all requested counters data then +data for all profiling groups should be collected and then the metrics can be calculated by +loading the saved groups' data to the profiling context. Saving and loading of the groups +data is the responsibility of the tool. The groups are automatically identified on the profiling +context open and there is an API to access them, see the "Profiling groups" section below. 
+ +Profiling context handle: + +typename rocprofiler_t; + +Profiling feature kind: + +typedef enum { + ROCPROFILER_FEATURE_KIND_METRIC = 0, // metric + ROCPROFILER_FEATURE_KIND_TRACE = 1 // trace +} rocprofiler_feature_kind_t; + +Profiling feature parameter: + +typedef hsa_ven_amd_aqlprofile_parameter_t rocprofiler_feature_parameter_t; + +Profiling data kind: + +typedef enum { + ROCPROFILER_DATA_KIND_UNINIT = 0, // data uninitialized + ROCPROFILER_DATA_KIND_INT32 = 1, // 32bit integer + ROCPROFILER_DATA_KIND_INT64 = 2, // 64bit integer + ROCPROFILER_DATA_KIND_FLOAT = 3, // float single-precision result + ROCPROFILER_DATA_KIND_DOUBLE = 4, // float double-precision result + ROCPROFILER_DATA_KIND_BYTES = 5 // trace output as a bytes array +} rocprofiler_data_kind_t; + + +Profiling data: + +typedef struct { + rocprofiler_data_kind_t kind; // result kind + union { + uint32_t result_int32; // 32bit integer result + uint64_t result_int64; // 64bit integer result + float result_float; // float single-precision result + double result_double; // float double-precision result + typedef struct { + void* ptr; // pointer + uint32_t size; // byte size + uint32_t instances; // number of trace instances + } result_bytes; // data by ptr and byte size + }; +} rocprofiler_data_t; + +Profiling feature: + +typedef struct { + rocprofiler_feature_kind_t type; // feature type + const char* name; // feature name + const rocprofiler_feature_parameter_t* parameters; // feature parameters + uint32_t parameter_count; // feature parameter count + rocprofiler_data_t* data; // profiling data +} rocprofiler_feature_t; + +Profiling mode masks: +There are several modes which can be specified for the profiling context. +STANDALONE mode can be used for the counters sampling in another then application context +to support statistical system wide profiling. In this mode the profiling context supports +its own queue which can be created on the context open if the CREATEQUEUE mode also specified. 
+See also "Profiler properties" section below for the standalone mode queue properties. +The profiler supports several profiling groups for collecting profiling data in several +runs and 'SINGLEGROUP' mode allows only one group and the context open will fail if more +groups are needed. + +typedef enum { + ROCPROFILER_MODE_STANDALONE = 1, // standalone mode when ROC profiler + // supports own AQL queue + ROCPROFILER_MODE_CREATEQUEUE = 2, // profiler creates queue in STANDALONE mode + ROCPROFILER_MODE_SINGLEGROUP = 4 // profiler allows one group only and fails + // if more groups are needed +} rocprofiler_mode_t; + +Context data readiness callback: + +typedef void (*rocprofiler_context_callback_t)( + rocprofiler_group_t* group, // profiling group + void* arg); // callback arg + +Profiler properties: +There are several properties which can be specified for the context. A callback can be +registered which will be called when the context data is ready. In standalone profiling mode +'ROCPROFILER_MODE_STANDALONE' the context supports its own queue and the queue can be set by +the property 'queue' or a queue will be created with the specified depth 'queue_depth' if mode +'ROCPROFILER_MODE_CREATEQUEUE' also specified. 
+ +typedef struct { + rocprofiler_context_callback_t callback; // callback on the context data readiness + void* callback_arg; // callback arg + hsa_queue_t* queue; // HSA queue for standalone mode + uint32_t queue_depth; // created queue depth, for create-queue mode +} rocprofiler_properties_t; + +Open/close profiling context: + +hsa_status_t rocprofiler_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in/out] profiling feature array + uint32_t feature_count, // profiling feature count + rocprofiler_t** context, // [out] profiling context handle + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiler properties + +hsa_status_t rocprofiler_close( + rocprofiler_t* context); // [in] profiling context + +Profiling groups: +The profiler on the context open automatically identifies a required number of the application +runs to collect all data needed for all specified metrics and creates a metric group per each +run. Data for all profiling groups should be collected and then the metrics can be calculated +by loading the saved groups' data to the profiling context. Saving and loading of the groups +data is the responsibility of the tool. + +typedef struct { + uint32_t index; // profiling group index + rocprofiler_feature_t** features; // profiling features array + uint32_t feature_count; // profiling feature count + rocprofiler_t* context; // profiling context handle +} rocprofiler_group_t; + +Return profiling groups count: + +hsa_status_t rocprofiler_group_count( + rocprofiler_t* context, // [in/out] profiling context + uint32_t* count); // [out] profiling groups count + +Return the profiling group for a given index: + +hsa_status_t rocprofiler_get_group( + rocprofiler_t* context, // [in/out] profiling context, + // will be returned as + // a part of the group structure + uint32_t index, // [in] group index + rocprofiler_group_t* group); // [out] profiling group + +Calculate metrics data. 
The data will be stored to the registered profiling features data fields: +After all profiling context data is ready the registered metrics can be calculated. The context +data readiness can be checked by 'get_data' API or using the context callback. + +hsa_status_t rocprofiler_get_metrics( + rocprofiler_t* context); // [in/out] profiling context + +Method for iterating trace data instances: +Trace data can have several instances, for example, one instance per Shader Engine. + +hsa_status_t rocprofiler_iterate_trace_data( + const rocprofiler_t* context, // [in] context object + hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate + // the output data + void* callback_data); // [in/out] data passed to callback +``` +### 4.4. Sampling API +``` +The API supports the counters sampling usage model with start/read/stop methods and also allows +waiting for the profiling data in the intercepting usage model with the get_data method. + +Start/stop/read methods: + +hsa_status_t rocprofiler_start( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_stop( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_read( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Wait for profiling data: + +hsa_status_t rocprofiler_get_data( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Group versions of the above start/stop/read/get_data methods: + +hsa_status_t rocprofiler_group_start( + rocprofiler_group_t* group); // [in/out] profiling group + +hsa_status_t rocprofiler_group_stop( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_read( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_get_data( + rocprofiler_group_t* group); // 
[in/out] profiling group +``` +### 4.5. Intercepting API +``` +The library provides a callback API for enabling profiling for the kernels dispatched to +HSA AQL queues. The API enables per-kernel profiling data collection. + +ROC profiler callback type: + +hsa_status_t (*rocprofiler_callback_t)( + const rocprofiler_callback_data_t* callback_data, // callback data passed by HSA runtime + void* user_data, // [in/out] user data passed + // to the callback + rocprofiler_group** group); // [out] returned profiling group + +Profiling callback data: + +typedef struct { + uint64_t dispatch; // dispatch timestamp + uint64_t begin; // begin timestamp + uint64_t end; // end timestamp + uint64_t complete; // completion signal timestamp +} rocprofiler_dispatch_record_t; + +typedef struct { + hsa_agent_t agent; // GPU agent handle + uint32_t agent_index; // GPU index + const hsa_queue_t* queue; // HSA queue + uint64_t queue_index; // Index in the queue + const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet + const char* kernel_name; // Kernel name + const rocprofiler_dispatch_record_t* record; // Dispatch record +} rocprofiler_callback_data_t; + +Queue callbacks: + +typedef struct { + rocprofiler_callback_t dispatch; // kernel dispatch callback + hsa_status_t (*destroy)(hsa_queue_t* queue, void* data); // queue destroy callback +} rocprofiler_queue_callbacks_t; + +Adding/removing kernel dispatch and queue destroy callbacks + +hsa_status_t rocprofiler_set_intercepting( + rocprofiler_intercepting_t callbacks, // intercepting callbacks + void* data); // [in/out] passed callbacks data + +hsa_status_t rocprofiler_remove_intercepting(); +``` +## 5. Application code examples +### 5.1. 
Querying available metrics +``` +Info data callback: + + hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void *data) { + switch (info.kind) { + case ROCPROFILER_INFO_KIND_METRIC: { + printf("metric %s, description %s\n", + info.metric.name, + info.metric.description); + break; + } + default: + printf("wrong info kind %u\n", info.kind); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; + } + +Printing all available metrics: + + hsa_status_t status = rocprofiler_iterate_info( + agent, + ROCPROFILER_INFO_KIND_METRIC, + info_data_callback, + NULL); + +``` +### 5.2. Profiling code example +``` +Profiling of L1 miss ratio, average memory bandwidth. +In the example below rocprofiler_group_get_data group APIs are used for the purpose of a usage +example but in SINGLEGROUP mode when only one group is allowed the context handle itself can be +saved and then direct context method rocprofiler_get_data with default group index equal to 0 +can be used. + +hsa_status_t dispatch_callback( + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t* group) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + // Profiling context + rocprofiler_t* context; + // Profiling info objects + rocprofiler_feature_t* features = new rocprofiler_feature_t[2]; + // Tracing parameters + rocprofiler_feature_parameter_t* parameters = new rocprofiler_feature_parameter_t[2]; + + // Setting profiling features + features[0].type = ROCPROFILER_FEATURE_KIND_METRIC; + features[0].name = "L1_MISS_RATIO"; + features[1].type = ROCPROFILER_FEATURE_KIND_METRIC; + features[1].name = "DRAM_BANDWIDTH"; + + // Creating profiling context + status = rocprofiler_open(callback_data->agent, features, 2, &context, + ROCPROFILER_MODE_SINGLEGROUP, NULL); + + + // Get the profiling group + // For general case with many groups there is rocprofiler_group_count() API + const uint32_t group_index = 0; + status = rocprofiler_get_group(context, group_index, group); + + + // In SINGLEGROUP mode 
the context handle itself can be saved, because there is just one group + + + return status; +} + +void profiling_libary_constructor() { + // Defining callback data, no data in this simple example + void* callback_data = NULL; + + // Adding observers + hsa_sttaus_t status = rocprofiler_add_dispatch_callback(dispatch_callback, callback_data); + + + // Dispatching profiled kernel + +} + +void profiling_libary_destructor() { + > { + // In SINGLEGROUP mode the rocprofiler_get_group() method with default zero group + // index can be used, if context handle would be saved + status = rocprofiler_group_get_data(entry->group); + + status = rocprofiler_get_metrics(entry->group->context); + + status = rocprofiler_close(entry->group->context); + + + dispatch_data, entry->features, entry->features_count)>; + } +} +``` +### 5.3. Option to use completion callback +``` +Creating profiling context with completion callback: + . . . + rocprofiler_properties_t properties = {}; + properties.callback = completion_callback; + properties.callback_arg = NULL; // no args defined + status = rocprofiler_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . + +Definition of completion callback: + +void completion_callback(profiler_group_t group, void* arg) { + + hsa_status_t status = rocprofiler_close(group.context); + +} +``` diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h new file mode 100644 index 00000000..e7a5a1e0 --- /dev/null +++ b/inc/rocprofiler.h @@ -0,0 +1,364 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +//////////////////////////////////////////////////////////////////////////////// +// +// ROC Profiler API +// +// The goal of the implementation is to provide a HW specific low-level +// performance analysis interface for profiling of GPU compute applications. +// The profiling includes HW performance counters with complex +// performance metrics and HW traces. +// +// The library can be used by a tool library loaded by HSA runtime or by +// higher level HW independent performance analysis API like PAPI. +// +// The library is written on C and will be based on AQLprofile AMD specific +// HSA extension. The library implementation requires HSA API intercepting and +// a profiling queue supporting a submit callback interface. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef INC_ROCPROFILER_H_ +#define INC_ROCPROFILER_H_ + +#include +#include +#include + +#define ROCPROFILER_VERSION_MAJOR 1 +#define ROCPROFILER_VERSION_MINOR 1 + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +//////////////////////////////////////////////////////////////////////////////// +// Returning library version + +uint32_t rocprofiler_version_major(); +uint32_t rocprofiler_version_minor(); + +//////////////////////////////////////////////////////////////////////////////// +// Global properties structure + +typedef struct { + uint32_t intercept_mode; + uint32_t sqtt_size; + uint32_t sqtt_local; + uint64_t timeout; + uint32_t timestamp_on; +} rocprofiler_settings_t; + +//////////////////////////////////////////////////////////////////////////////// +// Returning the error string method + +hsa_status_t rocprofiler_error_string( + const char** str); // [out] the API error string pointer returning + +//////////////////////////////////////////////////////////////////////////////// +// Profiling features and data +// +// Profiling features objects have profiling feature info, type, parameters and data +// Also profiling data samplaes can be iterated using a callback + +// Profiling feature kind +typedef enum { + ROCPROFILER_FEATURE_KIND_METRIC = 0, + ROCPROFILER_FEATURE_KIND_TRACE = 1 +} rocprofiler_feature_kind_t; + +// Profiling feture parameter +typedef hsa_ven_amd_aqlprofile_parameter_t rocprofiler_parameter_t; + +// Profiling data kind +typedef enum { + ROCPROFILER_DATA_KIND_UNINIT = 0, + ROCPROFILER_DATA_KIND_INT32 = 1, + ROCPROFILER_DATA_KIND_INT64 = 2, + ROCPROFILER_DATA_KIND_FLOAT = 3, + ROCPROFILER_DATA_KIND_DOUBLE = 4, + ROCPROFILER_DATA_KIND_BYTES = 5 +} rocprofiler_data_kind_t; + +// Profiling data type +typedef struct { + rocprofiler_data_kind_t kind; // result kind + union { + uint32_t result_int32; // 32bit integer result + uint64_t result_int64; 
// 64bit integer result + float result_float; // float single-precision result + double result_double; // float double-precision result + struct { + void* ptr; + uint32_t size; + uint32_t instance_count; + bool copy; + } result_bytes; // data by ptr and byte size + }; +} rocprofiler_data_t; + +// Profiling feature type +typedef struct { + rocprofiler_feature_kind_t kind; // feature kind + union { + const char* name; // feature name + struct { + const char* block; // counter block name + uint32_t event; // counter event id + } counter; + }; + const rocprofiler_parameter_t* parameters; // feature parameters array + uint32_t parameter_count; // feature parameters count + rocprofiler_data_t data; // profiling data +} rocprofiler_feature_t; + +// Profiling features set type +typedef void rocprofiler_feature_set_t; + +//////////////////////////////////////////////////////////////////////////////// +// Profiling context +// +// Profiling context object accumuate all profiling information + +// Profiling context object +typedef void rocprofiler_t; + +// Profiling group object +typedef struct { + unsigned index; // group index + rocprofiler_feature_t** features; // profiling info array + uint32_t feature_count; // profiling info count + rocprofiler_t* context; // context object +} rocprofiler_group_t; + +// Profiling mode mask +typedef enum { + ROCPROFILER_MODE_STANDALONE = 1, // standalone mode when ROC profiler supports a queue + ROCPROFILER_MODE_CREATEQUEUE = 2, // ROC profiler creates queue in standalone mode + ROCPROFILER_MODE_SINGLEGROUP = 4 // only one group is allowed, failed otherwise +} rocprofiler_mode_t; + +// Profiling handler, calling on profiling completion +typedef bool (*rocprofiler_handler_t)(rocprofiler_group_t group, void* arg); + +// Profiling preperties +typedef struct { + hsa_queue_t* queue; // queue for STANDALONE mode + // the queue is created and returned in CREATEQUEUE mode + uint32_t queue_depth; // created queue depth + rocprofiler_handler_t 
handler; // handler on completion + void* handler_arg; // the handler arg +} rocprofiler_properties_t; + +// Create new profiling context +hsa_status_t rocprofiler_open(hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_t** context, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiling properties + +// Add feature to e features set +hsa_status_t rocprofiler_add_feature(const rocprofiler_feature_t* feature, // [in] + rocprofiler_feature_set_t* features_set); // [in/out] profiling features set + +// Create new profiling context +hsa_status_t rocprofiler_features_set_open(hsa_agent_t agent, // GPU handle + rocprofiler_feature_set_t* features_set, // [in] profiling features set + rocprofiler_t** context, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiling properties + +// Delete profiling info +hsa_status_t rocprofiler_close(rocprofiler_t* context); // [in] profiling context + +// Context reset before reusing +hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling context + uint32_t group_index); // group index + +//////////////////////////////////////////////////////////////////////////////// +// Queue callbacks +// +// Queue callbacks for initiating profiling per kernel dispatch and to wait +// the profiling data on the queue destroy. 
+ +// Dispatch record +typedef struct { + uint64_t dispatch; // dispatch timestamp + uint64_t begin; // begin timestamp + uint64_t end; // end timestamp + uint64_t complete; // completion signal timestamp +} rocprofiler_dispatch_record_t; + +// Profiling callback data +typedef struct { + hsa_agent_t agent; // GPU agent handle + uint32_t agent_index; // GPU index + const hsa_queue_t* queue; // HSA queue + uint64_t queue_index; // Index in the queue + const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet + const char* kernel_name; // Kernel name + const rocprofiler_dispatch_record_t* record; // Dispatch record +} rocprofiler_callback_data_t; + +// Profiling callback type +typedef hsa_status_t (*rocprofiler_callback_t)( + const rocprofiler_callback_data_t* callback_data, // [in] callback data union, data depends on + // the callback API id + void* user_data, // [in/out] user data passed to the callback + rocprofiler_group_t* group); // [out] profiling group + +// Queue callbacks +typedef struct { + rocprofiler_callback_t dispatch; // dispatch callback + hsa_status_t (*destroy)(hsa_queue_t* queue, void* data); // destroy callback +} rocprofiler_queue_callbacks_t; + +// Set queue callbacks +hsa_status_t rocprofiler_set_queue_callbacks( + rocprofiler_queue_callbacks_t callbacks, // callbacks + void* data); // [in/out] passed callbacks data + +// Remove queue callbacks +hsa_status_t rocprofiler_remove_queue_callbacks(); + +//////////////////////////////////////////////////////////////////////////////// +// Start/stop profiling +// +// Start/stop the context profiling invocation, have to be as many as +// contect.invocations' to collect all profiling data + +// Start profiling +hsa_status_t rocprofiler_start(rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index); // group index + +// Stop profiling +hsa_status_t rocprofiler_stop(rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index); // group index + +// Read 
profiling +hsa_status_t rocprofiler_read(rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index); // group index + +// Read profiling data +hsa_status_t rocprofiler_get_data(rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index); // group index + +// Get profiling groups count +hsa_status_t rocprofiler_group_count(const rocprofiler_t* context, // [in] profiling context + uint32_t* group_count); // [out] profiling groups count + +// Get profiling group for a given index +hsa_status_t rocprofiler_get_group(rocprofiler_t* context, // [in] profiling context + uint32_t group_index, // profiling group index + rocprofiler_group_t* group); // [out] profiling group + +// Start profiling +hsa_status_t rocprofiler_group_start(rocprofiler_group_t* group); // [in/out] profiling group + +// Stop profiling +hsa_status_t rocprofiler_group_stop(rocprofiler_group_t* group); // [in/out] profiling group + +// Read profiling +hsa_status_t rocprofiler_group_read(rocprofiler_group_t* group); // [in/out] profiling group + +// Get profiling data +hsa_status_t rocprofiler_group_get_data(rocprofiler_group_t* group); // [in/out] profiling group + +// Get metrics data +hsa_status_t rocprofiler_get_metrics(const rocprofiler_t* context); // [in/out] profiling context + +// Definition of output data iterator callback +typedef hsa_ven_amd_aqlprofile_data_callback_t rocprofiler_trace_data_callback_t; + +// Method for iterating the events output data +hsa_status_t rocprofiler_iterate_trace_data( + rocprofiler_t* context, // [in] profiling context + rocprofiler_trace_data_callback_t callback, // callback to iterate the output data + void* data); // [in/out] callback data + +//////////////////////////////////////////////////////////////////////////////// +// Profiling features and data +// +// Profiling features objects have profiling feature info, type, parameters and data +// Also profiling data samplaes can be iterated using a callback + +// Profiling 
info kind +typedef enum { + ROCPROFILER_INFO_KIND_METRIC = 0, // metric info + ROCPROFILER_INFO_KIND_METRIC_COUNT = 1, // metric features count, int32 + ROCPROFILER_INFO_KIND_TRACE = 2, // trace info + ROCPROFILER_INFO_KIND_TRACE_COUNT = 3, // trace features count, int32 +} rocprofiler_info_kind_t; + +// Profiling info query +typedef union { + rocprofiler_info_kind_t info_kind; // queried profiling info kind + struct { + const char* trace_name; // queried info trace name + } trace_parameter; +} rocprofiler_info_query_t; + +// Profiling info data +typedef struct { + uint32_t agent_index; // GPU HSA agent index + rocprofiler_info_kind_t kind; // info data kind + union { + struct { + const char* name; // metric name + const char* expr; // metric expression, NULL for basic counters + const char* description; // metric description + } metric; + struct { + const char* name; // trace name + const char* description; // trace description + uint32_t parameter_count; // supported by the trace number parameters + } trace; + }; +} rocprofiler_info_data_t; + +// Return the info for a given info kind +hsa_status_t rocprofiler_get_info( + const hsa_agent_t* agent, // [in] GFXIP handle + rocprofiler_info_kind_t kind, // kind of iterated info + void *data); // [in/out] returned data + +// Iterate over the info for a given info kind, and invoke an application-defined callback on every iteration +hsa_status_t rocprofiler_iterate_info( + const hsa_agent_t* agent, // [in] GFXIP handle + rocprofiler_info_kind_t kind, // kind of iterated info + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); // [in/out] data passed to callback + +// Iterate over the info for a given info query, and invoke an application-defined callback on every iteration +hsa_status_t rocprofiler_query_info( + const hsa_agent_t *agent, // [in] GFXIP handle + rocprofiler_info_query_t query, // iterated info query + hsa_status_t (*callback)(const rocprofiler_info_data_t 
info, void *data), // callback + void *data); // [in/out] data passed to callback + +#ifdef __cplusplus +} // extern "C" block +#endif // __cplusplus + +#endif // INC_ROCPROFILER_H_ diff --git a/script/rpl_run.sh b/script/rpl_run.sh new file mode 100755 index 00000000..a8260e77 --- /dev/null +++ b/script/rpl_run.sh @@ -0,0 +1,377 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + +#!/bin/sh +time_stamp=`date +%y%m%d_%H%M%S` +BIN_DIR=`dirname $0` +BIN_DIR=`cd $BIN_DIR; pwd` +RUN_DIR=`pwd` +TMP_DIR="/tmp" +DATA_PATH=$TMP_DIR +DATA_DIR="rpl_data_${time_stamp}_$$" + +PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*$//"` +BIN_DIR=$PKG_DIR/bin + +# PATH to custom HSA and OpenCl runtimes +HSA_PATH=$PKG_DIR/lib/hsa + +export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH +export PATH=.:$PATH + +# enable error logging +export HSA_TOOLS_REPORT_LOAD_FAILURE=1 +export HSA_VEN_AMD_AQLPROFILE_LOG=1 +export ROCPROFILER_LOG=1 + +# ROC Profiler environment +# Loading of ROC Profiler by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so +# Loading of the test tool by ROC Profiler +export ROCP_TOOL_LIB=libtool.so +# Enabling HSA dispatches intercepting by ROC PRofiler +export ROCP_HSA_INTERCEPT=1 +# Disabling internal ROC Profiler proxy queue (simple version supported for testing purposes) +unset ROCP_PROXY_QUEUE +# ROC Profiler metrics definition +export ROCP_METRICS=$PKG_DIR/lib/metrics.xml +# ROC Profiler package path +export ROCP_PACKAGE_DIR=$PKG_DIR + +# error handling +fatal() { + echo "$0: Error: $1" + echo "" + usage +} + +error() { + echo "$0: Error: $1" + echo "" + exit 1 +} + +# usage method +usage() { + bin_name=`basename $0` + echo "ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package." 
+ echo "Full path: $BIN_DIR/$bin_name" + echo "Metrics definition: $PKG_DIR/lib/metrics.xml" + echo "" + echo "Usage:" + echo " rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] " + echo "" + echo "Options:" + echo " -h - this help" + echo " --verbose - verbose mode, dumping all base counters used in the input metrics" + echo " --list-basic - to print the list of basic HW counters" + echo " --list-derived - to print the list of derived metrics with formulas" + echo "" + echo " -i <.txt|.xml file> - input file" + echo " Input file .txt format, automatically rerun application for every pmc/sqtt line:" + echo "" + echo " # Perf counters group 1" + echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize" + echo " # Perf counters group 2" + echo " pmc : WriteSize L2CacheHit" + echo " # SQ tread trace" + echo " sqtt : MASK = 0x0F00 TOKEN_MASK = 0x144B TOKEN_MASK2 = 0xFFFF" + echo " # Filter by dispatches range, GPU index and kernel names" + echo " # supported range formats: \"3:9\", \"3:\", \"3\"" + echo " range: 1 : 4" + echo " gpu: 0 1 2 3" + echo " kernel: simple Pass1 simpleConvolutionPass2" + echo "" + echo " Input file .xml format, for single profiling run:" + echo "" + echo " # Metrics list definition, also the form \":\" can be used" + echo " # All defined metrics can be found in the 'metrics.xml'" + echo " # There are basic metrics for raw HW counters and high-level metrics for derived counters" + echo " " + echo "" + echo " # Trace enabling and the parameters definition" + echo " " + echo " " + echo " " + echo "" + echo " # Filter by dispatches range, GPU index and kernel names" + echo " " + echo "" + echo " Supported by profiler SQTT parameters:" + echo " TARGET_CU - target Compute Unit, MASK.CU_SEL field" + echo " VM_ID_MASK - select which VM IDs to capture, MASK.VM_ID_MASK field" + echo " MASK - MASK register value" + echo " TOKEN_MASK - TOKEN_MASK register value" + echo " 
TOKEN_MASK2 - TOKEN_MASK2 register value, traced instructions mask" + echo " The parameters defaults:" + echo " TARGET_CU = 0;" + echo " VM_ID_MASK = 0;" + echo " MASK:" + echo " mask.bits.CU_SEL = param{TARGET_CU};" + echo " mask.bits.SH_SEL = 0x0;" + echo " mask.bits.SIMD_EN = 0xF;" + echo " mask.bits.SQ_STALL_EN = 0x1;" + echo " mask.bits.SPI_STALL_EN = 0x1;" + echo " mask.bits.REG_STALL_EN = 0x1;" + echo " mask.bits.VM_ID_MASK = param{VM_ID_MASK};" + echo " TOKEN_MASK:" + echo " token_mask.bits.TOKEN_MASK = 0xFFFF;" + echo " token_mask.bits.REG_MASK = 0xFF;" + echo " token_mask.bits.REG_DROP_ON_STALL = 0x1;" + echo " TOKEN_MASK2:" + echo " token_mask2.bits.INST_MASK = 0xFFFFFF7F; // INST_PC is disabled because its tracing can cause extra stalling" + echo " // and it is recommended to disable by SQTT user guide" + echo " HIWATER = 6; // which is 6/8 fraction of the tread trace fifo" + echo "" + echo " -o - output CSV file [.csv]" + echo " -d - directory where profiler store profiling data including thread treaces [/tmp]" + echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." + echo " -t - to change the temporary directory [/tmp]" + echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." + echo "" + echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" + echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" + echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" + echo " --heartbeat - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [0 - disabled]" + echo " --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000]" + echo " Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively." 
+ echo " --sqtt-local - to allocate SQTT buffer in local GPU memory [on]" + echo "" + echo "Configuration file:" + echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" + echo" First the configuration file is looking in the current directory, then in your home, and then in the package directory." + echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'." + echo " An example of 'rpl_rc.xml':" + echo " " + echo "" + exit 1 +} + +# profiling run method +OUTPUT_LIST="" +run() { + export ROCP_INPUT="$1" + OUTPUT_DIR="$2" + shift + shift + APP_CMD=$* + + if [ "$OUTPUT_DIR" = "-" ] ; then + input_tag=`echo $ROCP_INPUT | sed "s/\.xml//"` + export ROCP_OUTPUT_DIR=${input_tag}_results_${time_stamp} + elif [ "$OUTPUT_DIR" = "--" ] ; then + unset ROCP_OUTPUT_DIR + else + export ROCP_OUTPUT_DIR=$OUTPUT_DIR + fi + echo "RPL: result dir '$ROCP_OUTPUT_DIR'" + + if [ ! -e "$ROCP_INPUT" ] ; then + error "Input file '$ROCP_INPUT' not found" + fi + + if [ -n "$ROCP_OUTPUT_DIR" ] ; then + if [ "$OUTPUT_DIR" = "-" ] ; then + if [ -e "$ROCP_OUTPUT_DIR" ] ; then + error "generated dir '$ROCP_OUTPUT_DIR' exists" + fi + fi + mkdir -p "$ROCP_OUTPUT_DIR" + fi + + if [ -n "$ROCP_OUTPUT_DIR" ] ; then + OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" + eval "$APP_CMD 2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" + else + eval "$APP_CMD" + fi +} + +# main +echo "RPL: on '$time_stamp' from '$PKG_DIR' at '$RUN_DIR'" +# Parsing arguments +if [ -z "$1" ] ; then + usage +fi + +INPUT_FILE="" +OUTPUT_DIR="-" +output="" +csv_output="" + +ARG_IN="" +while [ 1 ] ; do + ARG_IN=$1 + ARG_VAL=1 + if [ "$1" = "-h" ] ; then + usage + elif [ "$1" = "-i" ] ; then + INPUT_FILE="$2" + elif [ "$1" = "-o" ] ; then + output="$2" + elif [ "$1" = "-d" ] ; then + OUTPUT_DIR="$2" + DATA_PATH=$OUTPUT_DIR + elif [ "$1" = "-t" ] ; then + TMP_DIR="$2" + if [ "$OUTPUT_DIR" = "-" ] ; then + 
DATA_PATH=$TMP_DIR + fi + elif [ "$1" = "--list-basic" ] ; then + export ROCP_INFO=b + eval "$PKG_DIR/test/SimpleConvolution" + exit 1 + elif [ "$1" = "--list-derived" ] ; then + export ROCP_INFO=d + eval "$PKG_DIR/test/SimpleConvolution" + exit 1 + elif [ "$1" = "--basenames" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_TRUNCATE_NAMES=1 + else + export ROCP_TRUNCATE_NAMES=0 + fi + elif [ "$1" = "--timestamp" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_TRACKER_ON=1 + else + export ROCP_TRACKER_ON=0 + fi + elif [ "$1" = "--ctx-limit" ] ; then + export ROCP_OUTSTANDING_MAX="$2" + elif [ "$1" = "--heartbeat" ] ; then + export ROCP_OUTSTANDING_MON="$2" + elif [ "$1" = "--sqtt-size" ] ; then + size_m=`echo "$2" | sed -n "s/^\(.*\)M$/\1/p"` + size_k=`echo "$2" | sed -n "s/^\(.*\)K$/\1/p"` + if [ -n "$size_m" ] ; then size_b=$((size_m*1024*1024)) + elif [ -n "$size_k" ] ; then size_b=$((size_k*1024)) + else size_b=$2 + fi + export ROCP_SQTT_SIZE=$size_b + elif [ "$1" = "--sqtt-local" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_SQTT_LOCAL=1 + else + export ROCP_SQTT_LOCAL=0 + fi + elif [ "$1" = "--verbose" ] ; then + ARG_VAL=0 + export ROCP_VERBOSE_MODE=1 + else + break + fi + shift + if [ "$ARG_VAL" = 1 ] ; then shift; fi +done + +ARG_CK=`echo $ARG_IN | sed "s/^-.*$/-/"` +if [ "$ARG_CK" = "-" ] ; then + fatal "Wrong option '$ARG_IN'" +fi + +if [ -z "$INPUT_FILE" ] ; then + fatal "Need input file" +fi + +input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"` +input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"` +if [ -z "${input_base}" -o -z "${input_type}" ] ; then + fatal "Bad input file '$INPUT_FILE'" +fi +input_base=`basename $input_base` + +if [ "$OUTPUT_DIR" = "--" ] ; then + fatal "Bad output dir '$OUTPUT_DIR'" +fi + +if [ -n "$output" ] ; then + if [ "$output" = "--" ] ; then + OUTPUT_DIR="--" + else + csv_output=$output + fi +else + csv_output=$RUN_DIR/${input_base}.csv +fi + +APP_CMD=$* + +echo "RPL: profiling 
'$APP_CMD'" +echo "RPL: input file '$INPUT_FILE'" + +input_list="" +RES_DIR="" +if [ "$input_type" = "xml" ] ; then + input_list=$INPUT_FILE +elif [ "$input_type" = "txt" ] ; then + OUTPUT_DIR="-" + RES_DIR=$DATA_PATH/$DATA_DIR + if [ -e $RES_DIR ] ; then + error "Rundir '$RES_DIR' exists" + fi + mkdir -p $RES_DIR + echo "RPL: output dir '$RES_DIR'" + $BIN_DIR/txt2xml.sh $INPUT_FILE $RES_DIR + input_list=`/bin/ls $RES_DIR/input*.xml` +else + fatal "Bad input file type '$INPUT_FILE'" +fi + +for name in $input_list; do + run $name $OUTPUT_DIR $APP_CMD +done + +if [ -n "$csv_output" ] ; then + python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST + if [ "$?" = 1 ] ; then + error "CSV generation error, profiling results '$RES_DIR'" + fi + echo "RPL: '$csv_output' is generated" +fi + +if [ "$DATA_PATH" = "$TMP_DIR" ] ; then + if [ -e "$RES_DIR" ] ; then + rm -rf $RES_DIR + fi +fi + +exit 0 diff --git a/script/tblextr.py b/script/tblextr.py new file mode 100755 index 00000000..9a314db4 --- /dev/null +++ b/script/tblextr.py @@ -0,0 +1,119 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
################################################################################

#!/usr/bin/python
"""Merge profiler 'results.txt' dumps into one CSV table.

Usage: tblextr.py <output CSV file> <input result file> [<input result file> ...]

Every input file is scanned for dispatch header lines and the counter lines
that follow them; records that share a dispatch index (also across input
files) are merged into one table row. The code is kept compatible with both
Python 2 and Python 3 because the caller invokes plain ``python``.
"""
import os
import re
import sys

# Parsing results in the format:
#dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503):
#  GRBM_GUI_ACTIVE (74332)
#  SQ_WAVES (4096)
#  SQ_INSTS_VMEM_RD (36864)

# global vars
# Column names in output order; counter names are appended as they are seen.
var_list = ['Index', 'KernelName', 'DispatchNs', 'BeginNs', 'EndNs', 'CompleteNs']
# Maps the dispatch index (kept as the string found in the input) to a dict
# of column name -> value (all values are strings).
var_table = {}

# Compiled once at import time; raw strings keep the regex escapes intact.
_BEG_PATTERN = re.compile(r'^dispatch\[(\d*)\], queue_index\(\d*\), kernel_name\("([^"]*)"\)')
_TS_PATTERN = re.compile(r', time\((\d*),(\d*),(\d*),(\d*)\)')
_VAR_PATTERN = re.compile(r'^\s*([^\s]*)\s+\((\d*)\)')
#############################################################


def fatal(msg):
    """Write ``msg`` to stderr prefixed with the script name and exit with status 1."""
    sys.stderr.write(sys.argv[0] + ": " + msg + "\n")
    sys.exit(1)
#############################################################


# parse results method
def parse_res(infile):
    """Parse one results file and merge its records into ``var_table``.

    A dispatch header line creates the table entry for its index (if it does
    not exist yet) and stores the four timestamps when a ``time(...)`` field
    is present. Every following counter line is attached to the most recent
    dispatch of this file, and its name is appended to ``var_list`` the first
    time it is seen.

    Exits through ``fatal()`` when the file does not exist or when a counter
    line precedes the first dispatch header.
    """
    if not os.path.isfile(infile):
        fatal("Error: input file '" + infile + "' not found")

    dispatch_number = None  # no dispatch header seen yet in this file
    with open(infile, 'r') as inp:
        for line in inp:
            # Strip only the line terminator: slicing off the last character
            # would eat the closing ')' of a final line without a newline.
            record = line.rstrip('\r\n')

            m = _VAR_PATTERN.match(record)
            if m:
                if dispatch_number not in var_table:
                    fatal("Error: counter '" + m.group(1) + "' found before any dispatch record in '" + infile + "'")
                var = m.group(1)
                var_table[dispatch_number][var] = m.group(2)
                if var not in var_list:
                    var_list.append(var)
                continue

            m = _BEG_PATTERN.match(record)
            if m:
                dispatch_number = m.group(1)
                if dispatch_number not in var_table:
                    var_table[dispatch_number] = {
                        'Index': dispatch_number,
                        'KernelName': "\"" + m.group(2) + "\""
                    }
                m = _TS_PATTERN.search(record)
                if m:
                    entry = var_table[dispatch_number]
                    entry['DispatchNs'] = m.group(1)
                    entry['BeginNs'] = m.group(2)
                    entry['EndNs'] = m.group(3)
                    entry['CompleteNs'] = m.group(4)
#############################################################


# print results table method
def print_tbl(outfile):
    """Write ``var_table`` to ``outfile`` as CSV, one row per dispatch.

    Rows are ordered by numeric dispatch index. The columns are the names of
    ``var_list`` that are present in the first (lowest index) dispatch; a
    later dispatch that lacks one of them gets an empty cell. Every line,
    including the header, ends with a trailing comma.

    Exits through ``fatal()`` when no dispatch was parsed or when an entry's
    'Index' value disagrees with its table key.
    """
    if not var_table:
        fatal("Error: no dispatch records found, '" + outfile + "' is not generated")

    # sorted() works on the dict in Python 2 and 3; dict.keys() has no
    # sort() method in Python 3.
    keys = sorted(var_table, key=int)

    first_entry = var_table[keys[0]]
    columns = [var for var in var_list if var in first_entry]

    with open(outfile, 'w') as out:
        out.write(''.join(var + ',' for var in columns) + "\n")
        for ind in keys:
            entry = var_table[ind]
            dispatch_number = entry['Index']
            if ind != dispatch_number:
                fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n")
            out.write(''.join(entry.get(var, '') + ',' for var in columns) + "\n")
#############################################################


# main
def main(argv=None):
    """Parse every input file named in ``argv[2:]`` and write the CSV to ``argv[1]``."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        fatal("Usage: " + argv[0] + " <output CSV file> <input result file> [<input result file> ...]")

    outfile = argv[1]
    for infile in argv[2:]:
        parse_res(infile)
    print_tbl(outfile)


if __name__ == "__main__":
    main()
    sys.exit(0)
#############################################################
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +#!/bin/bash +timestamp=`date +%y%m%d_%H%M%S` + +if [ $# = 0 ] ; then + echo "Usage: $0 [output dir]" + exit -1 +fi + +input=$1 +outdir=$2 +if [ -z "$outdir" ] ; then + outdir="." 
+fi + +range="" +kernel="" +gpu_index="" + +parse() { + scan="$1" + index=0 + while read -r line ; do + line=`echo $line | sed "s/\s*#.*$//"` + if [ -z "$line" ] ; then + continue + fi + + feature=`echo $line | sed -n "s/^\s*\([a-z]*\)\s*:.*$/\1/p"` + line=`echo $line | sed "s/^[^:]*:\s*//"` + line=`echo "$line" | sed -e "s/\s*=\s*/=/g" -e "s/\s*:\s*/:/g" -e "s/,\{1,\}/ /g" -e "s/\s\{1,\}/ /g" -e "s/\s*$//"` + + if [ "$scan" = 0 ] ; then + line=`echo "$line" | sed -e "s/ /,/g"` + if [ "$feature" == "range" ] ; then + range=$line + fi + if [ "$feature" == "kernel" ] ; then + kernel=$line + fi + if [ "$feature" == "gpu" ] ; then + gpu_index=$line + fi + else + output=$outdir/input${index}.xml + header="# $timestamp '$output' generated with '$0 $*'" + + if [ "$feature" == "pmc" ] ; then + line=`echo "$line" | sed -e "s/ /,/g"` + cat >> $output < + +EOF + fi + + if [ "$feature" == "sqtt" ] ; then + cat >> $output < + +EOF + fi + fi + + index=$((index + 1)) + done < $input +} + +parse 0 +parse 1 + +exit 0 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 00000000..45bc2719 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,37 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +# +# Build dynamic Library object +# +set ( TARGET_LIB "${TARGET_NAME}" ) +set ( LIB_SRC + ${LIB_DIR}/core/rocprofiler.cpp + ${LIB_DIR}/core/proxy_queue.cpp + ${LIB_DIR}/core/simple_proxy_queue.cpp + ${LIB_DIR}/core/intercept_queue.cpp + ${LIB_DIR}/core/metrics.cpp + ${LIB_DIR}/util/hsa_rsrc_factory.cpp +) +add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++) diff --git a/src/core/context.h b/src/core/context.h new file mode 100644 index 00000000..966acaef --- /dev/null +++ b/src/core/context.h @@ -0,0 +1,546 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_CORE_CONTEXT_H_ +#define SRC_CORE_CONTEXT_H_ + +#include "inc/rocprofiler.h" + +#include +#include +#include +#include +#include + +#include "core/metrics.h" +#include "core/profile.h" +#include "core/queue.h" +#include "core/types.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" +#include "util/logger.h" + +namespace rocprofiler { +struct rocprofiler_contex_t; +class Context; + +inline unsigned align_size(unsigned size, unsigned alignment) { + return ((size + alignment - 1) & ~(alignment - 1)); +} + +// Block descriptor +struct block_des_t { + uint32_t id; + uint32_t index; +}; + +// block_des_t less-then functor +struct lt_block_des { + bool operator()(const block_des_t& a1, const block_des_t& a2) const { + return (a1.id < a2.id) || ((a1.id == a2.id) && (a1.index < a2.index)); + } +}; + +// Block status +struct block_status_t { + uint32_t max_counters; + uint32_t counter_index; + uint32_t group_index; +}; + +// Metrics arguments +template class MetricArgs : public xml::args_cache_t { + public: + MetricArgs(const Map& map) : map_(map) {} + bool Lookup(const std::string& name, uint64_t& result) const { + rocprofiler_feature_t* info = NULL; + auto it = map_.find(name); + if (it == map_.end()) EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' is not found"); + info = it->second; + if (info) { + result = info->data.result_int64; + if (info->data.kind == ROCPROFILER_DATA_KIND_UNINIT) + 
EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' is uninitialized"); + if (info->data.kind != ROCPROFILER_DATA_KIND_INT64) + EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' is of incompatible type, not INT64"); + } else + EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' info is NULL"); + return (info != NULL); + } + + private: + const Map& map_; +}; + +// Profiling group +class Group { + public: + Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) + : pmc_profile_(agent_info), + sqtt_profile_(agent_info), + n_profiles_(0), + refs_(1), + context_(context), + index_(index) {} + + void Insert(const profile_info_t& info) { + const rocprofiler_feature_kind_t kind = info.rinfo->kind; + info_vector_.push_back(info.rinfo); + switch (kind) { + case ROCPROFILER_FEATURE_KIND_METRIC: + pmc_profile_.Insert(info); + break; + case ROCPROFILER_FEATURE_KIND_TRACE: + sqtt_profile_.Insert(info); + break; + default: + EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); + } + } + + hsa_status_t Finalize() { + hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, read_vector_); + if (status == HSA_STATUS_SUCCESS) { + status = sqtt_profile_.Finalize(start_vector_, stop_vector_, read_vector_); + } + if (status == HSA_STATUS_SUCCESS) { + if (!pmc_profile_.Empty()) ++n_profiles_; + if (!sqtt_profile_.Empty()) ++n_profiles_; + } + return status; + } + + void GetProfiles(profile_vector_t& vec) { + pmc_profile_.GetProfiles(vec); + sqtt_profile_.GetProfiles(vec); + } + + void GetTraceProfiles(profile_vector_t& vec) { sqtt_profile_.GetProfiles(vec); } + + info_vector_t& GetInfoVector() { return info_vector_; } + const pkt_vector_t& GetStartVector() const { return start_vector_; } + const pkt_vector_t& GetStopVector() const { return stop_vector_; } + const pkt_vector_t& GetReadVector() const { return read_vector_; } + Context* GetContext() { return context_; } + uint32_t GetIndex() const { return index_; } + + 
void ResetRefs() { refs_ = n_profiles_; } + uint32_t DecrRefs() { + return (refs_ > 0) ? --refs_ : 0; + } + + private: + PmcProfile pmc_profile_; + SqttProfile sqtt_profile_; + info_vector_t info_vector_; + pkt_vector_t start_vector_; + pkt_vector_t stop_vector_; + pkt_vector_t read_vector_; + uint32_t n_profiles_; + uint32_t refs_; + Context* const context_; + const uint32_t index_; +}; + +// Profiling context +class Context { + public: + typedef std::mutex mutex_t; + typedef std::map info_map_t; + + Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) + : agent_(agent_info->dev_id), + agent_info_(agent_info), + queue_(queue), + hsa_rsrc_(&util::HsaRsrcFactory::Instance()), + api_(hsa_rsrc_->AqlProfileApi()), + handler_(handler), + handler_arg_(handler_arg) + { + metrics_ = MetricsDict::Create(agent_info); + if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + Initialize(info, info_count); + Finalize(); + + if (handler != NULL) { + for (unsigned group_index = 0; group_index < set_.size(); ++group_index) { + set_[group_index].ResetRefs(); + const profile_vector_t profile_vector = GetProfiles(group_index); + for (auto& tuple : profile_vector) { + // Handler for stop packet completion + hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, + &set_[group_index]); + } + } + } + } + + ~Context() { + for (const auto& v : info_map_) { + const std::string& name = v.first; + const rocprofiler_feature_t* info = v.second; + if ((info->kind == ROCPROFILER_FEATURE_KIND_METRIC) && + (metrics_map_.find(name) == metrics_map_.end())) { + delete info; + } + } + } + + // Initialize rocprofiler context + void Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { + // Register input features to not duplicate by features referencing + for (unsigned i = 0; i < info_count; ++i) { + 
rocprofiler_feature_t* info = &info_array[i]; + if (!info->name) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); + info_map_[info->name] = info; + } + + // Adding zero group, always present + if (info_count) set_.push_back(Group(agent_info_, this, 0)); + + // Processing input features + for (unsigned i = 0; i < info_count; ++i) { + rocprofiler_feature_t* info = &info_array[i]; + const rocprofiler_feature_kind_t kind = info->kind; + const char* name = info->name; + + if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { // Processing metrics features + const Metric* metric = metrics_->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); +#if 0 + std::cout << " " << name << (metric->GetExpr() ? " = " + metric->GetExpr()->String() : " counter") << std::endl; +#endif + + auto ret = metrics_map_.insert({name, metric}); + if (!ret.second) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name + << "' is registered more then once"); + + counters_vec_t counters_vec = metric->GetCounters(); + if (counters_vec.empty()) + EXC_RAISING(HSA_STATUS_ERROR, "bad metric '" << name << "' is empty"); + + for (const counter_t* counter : counters_vec) { + // For metrics expressions checking that there is no the same counter in the input metrics + // and also that the counter wasn't registered already by another input metric expression + if (metric->GetExpr()) { + if (info_map_.find(counter->name) != info_map_.end()) { + continue; + } else { + info = NewCounterInfo(counter); + info_map_[info->name] = info; + } + } + + const event_t* event = &(counter->event); + const block_des_t block_des = {event->block_name, event->block_index}; + auto ret = groups_map_.insert({block_des, {}}); + block_status_t& block_status = ret.first->second; + if (block_status.max_counters == 0) { + profile_t query = {}; + query.agent = agent_; + query.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + query.events = event; + + uint32_t block_counters; 
+ hsa_status_t status = api_->hsa_ven_amd_aqlprofile_get_info( + &query, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "get block_counters info"); + block_status.max_counters = block_counters; + } + if (block_status.counter_index >= block_status.max_counters) { + block_status.counter_index = 0; + block_status.group_index += 1; + } + if (block_status.group_index >= set_.size()) { + set_.push_back(Group(agent_info_, this, block_status.group_index)); + } + const uint32_t group_index = block_status.group_index; + set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); + } + } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features + set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); + } else { + EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); + } + } + } + + void Finalize() { + for (unsigned index = 0; index < set_.size(); ++index) { + const hsa_status_t status = set_[index].Finalize(); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed"); + } + } + + void Reset(const uint32_t& group_index) { set_[group_index].ResetRefs(); } + + uint32_t GetGroupCount() const { return set_.size(); } + + rocprofiler_group_t GetGroupInfo(Group* g) { + rocprofiler::info_vector_t& info_vector = g->GetInfoVector(); + rocprofiler_group_t group = {}; + group.index = g->GetIndex(); + group.context = reinterpret_cast(this); + group.features = &info_vector[0]; + group.feature_count = info_vector.size(); + return group; + } + rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + return GetGroupInfo(&set_[index]); + } + + const pkt_vector_t& StartPackets(const uint32_t& group_index) const { + return set_[group_index].GetStartVector(); + } + const pkt_vector_t& StopPackets(const uint32_t& group_index) const { + return set_[group_index].GetStopVector(); + } + const pkt_vector_t& 
ReadPackets(const uint32_t& group_index) const { + return set_[group_index].GetReadVector(); + } + + void Start(const uint32_t& group_index, Queue* const queue = NULL) { + const pkt_vector_t& start_packets = StartPackets(group_index); + Queue* const submit_queue = (queue != NULL) ? queue : queue_; + submit_queue->Submit(&start_packets[0], start_packets.size()); + } + void Stop(const uint32_t& group_index, Queue* const queue = NULL) { + const pkt_vector_t& stop_packets = StopPackets(group_index); + Queue* const submit_queue = (queue != NULL) ? queue : queue_; + submit_queue->Submit(&stop_packets[0], stop_packets.size()); + } + void Read(const uint32_t& group_index, Queue* const queue = NULL) { + const pkt_vector_t& read_packets = ReadPackets(group_index); + if (read_packets.size() == 0) EXC_RAISING(HSA_STATUS_ERROR, "Read API disabled"); + Queue* const submit_queue = (queue != NULL) ? queue : queue_; + submit_queue->Submit(&read_packets[0], read_packets.size()); + } + void Submit(const uint32_t& group_index, const packet_t* packet, Queue* const queue = NULL) { + Queue* const submit_queue = (queue != NULL) ? 
queue : queue_; + Start(group_index, submit_queue); + submit_queue->Submit(packet); + Stop(group_index, submit_queue); + } + + struct callback_data_t { + const profile_t* profile; + info_vector_t* info_vector; + size_t index; + char* ptr; + }; + + void GetData(const uint32_t& group_index) { + const profile_vector_t profile_vector = GetProfiles(group_index); + for (auto& tuple : profile_vector) { + // Wait for stop packet to complete + const uint64_t timeout = timeout_; + bool complete = false; + while (!complete) { + const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout, + HSA_WAIT_STATE_BLOCKED); + complete = (signal_value < 1); + if (!complete) WARN_LOGGING("timeout"); + } + for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; + callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL}; + const hsa_status_t status = + api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, DataCallback, &callback_data); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); + } + } + + void GetMetricsData() const { + const MetricArgs args(info_map_); + for (const auto v : metrics_map_) { + const std::string& name = v.first; + const Metric* metric = v.second; + const xml::Expr* expr = metric->GetExpr(); + if (expr) { + auto it = info_map_.find(name); + if (it == info_map_.end()) + EXC_RAISING(HSA_STATUS_ERROR, "metric '" << name << "', rocprofiler info is not found " << this); + rocprofiler_feature_t* info = it->second; + info->data.result_int64 = expr->Eval(args); + info->data.kind = ROCPROFILER_DATA_KIND_INT64; + } + } + } + + void IterateTraceData(rocprofiler_trace_data_callback_t callback, void* data) { + profile_vector_t profile_vector; + set_[0].GetTraceProfiles(profile_vector); + for (auto& tuple : profile_vector) { + const hsa_status_t status = + 
api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, callback, data); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); + } + } + + static void SetTimeout(uint64_t timeout) { timeout_ = timeout; } + static uint64_t GetTimeout() { return timeout_; } + + private: + // Getting profling packets + profile_vector_t GetProfiles(const uint32_t& index) { + profile_vector_t vec; + if (index >= set_.size()) { + EXC_RAISING(HSA_STATUS_ERROR, "index exceeding the maximum " << set_.size()); + } + set_[index].GetProfiles(vec); + return vec; + } + + static bool Handler(hsa_signal_value_t value, void* arg) { + Group* group = reinterpret_cast(arg); + Context* context = group->GetContext(); + context->mutex_.lock(); + uint32_t r = group->DecrRefs(); + context->mutex_.unlock(); + if (r == 0) { + return context->handler_(context->GetGroupInfo(group), context->handler_arg_); + } + return false; + } + + static hsa_status_t DataCallback(hsa_ven_amd_aqlprofile_info_type_t ainfo_type, + hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) { + hsa_status_t status = HSA_STATUS_SUCCESS; + callback_data_t* callback_data = reinterpret_cast(data); + const profile_t* profile = callback_data->profile; + info_vector_t& info_vector = *(callback_data->info_vector); + uint32_t index = callback_data->index; + const uint32_t sample_id = ainfo_data->sample_id; + if (info_vector.size() == index) { + index = 0; + } else { + if (sample_id == 0) index += 1; + } + callback_data->index = index; + + if (index < info_vector.size()) { + rocprofiler_feature_t* const rinfo = info_vector[index]; + rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; + + if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) { + if (ainfo_data->sample_id == 0) rinfo->data.result_int64 = 0; + rinfo->data.result_int64 += ainfo_data->pmc_data.result; + rinfo->data.kind = ROCPROFILER_DATA_KIND_INT64; + } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { + if 
(rinfo->data.result_bytes.copy) { + const bool sqtt_local = SqttProfile::IsLocal(); + util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); + if (sample_id == 0) { + const uint32_t output_buffer_size = profile->output_buffer.size; + const uint32_t output_buffer_size64 = profile->output_buffer.size / sizeof(uint64_t); + const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent); + void* ptr = (sqtt_local) ? hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : + calloc(output_buffer_size64, sizeof(uint64_t)); + rinfo->data.result_bytes.size = output_buffer_size; + rinfo->data.result_bytes.ptr = ptr; + callback_data->ptr = reinterpret_cast(ptr); + } + char* result_bytes_ptr = reinterpret_cast(rinfo->data.result_bytes.ptr); + const char* end = result_bytes_ptr + rinfo->data.result_bytes.size; + const char* src = reinterpret_cast(ainfo_data->sqtt_data.ptr); + uint32_t size = ainfo_data->sqtt_data.size; + char* ptr = callback_data->ptr; + uint32_t* header = reinterpret_cast(ptr); + char* dest = ptr + sizeof(*header); + + if ((dest + size) >= end) { + if (dest < end) size = end - dest; + else EXC_RAISING(HSA_STATUS_ERROR, "SQTT data out of output buffer"); + } + + bool suc = true; + if (sqtt_local) { + suc = hsa_rsrc->Memcpy(profile->agent, dest, src, size); + } else { + memcpy(dest, src, size); + } + if (suc) { + *header = size; + callback_data->ptr = dest + align_size(size, sizeof(uint32_t)); + rinfo->data.result_bytes.instance_count = sample_id + 1; + rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; + } else + EXC_RAISING(HSA_STATUS_ERROR, "Agent Memcpy failed, dst(" << (void*)dest << ") src(" << (void*)src << ") size(" << size << ")"); + } else { + if (sample_id == 0) { + rinfo->data.result_bytes.ptr = profile->output_buffer.ptr; + rinfo->data.result_bytes.size = profile->output_buffer.size; + rinfo->data.result_bytes.instance_count = UINT32_MAX; + } + + rinfo->data.result_bytes.instance_count += 1; + rinfo->data.kind = 
ROCPROFILER_DATA_KIND_BYTES; + } + } else { + EXC_RAISING(HSA_STATUS_ERROR, "unknown data type = " << ainfo_type); + } + } else + status = HSA_STATUS_ERROR; + + return status; + } + + rocprofiler_feature_t* NewCounterInfo(const counter_t* counter) { + rocprofiler_feature_t* info = new rocprofiler_feature_t{}; + info->kind = ROCPROFILER_FEATURE_KIND_METRIC; + info->name = counter->name.c_str(); + return info; + } + + // Profiling data waiting timeout + static uint64_t timeout_; + + // GPU handel + const hsa_agent_t agent_; + const util::AgentInfo* agent_info_; + // Profiling queue + Queue* queue_; + // HSA resources factory + util::HsaRsrcFactory* hsa_rsrc_; + // aqlprofile API table + const pfn_t* api_; + // Profile group set + std::vector set_; + // Metrics dictionary + const MetricsDict* metrics_; + // Groups map + std::map groups_map_; + // Info map + info_map_t info_map_; + // Metrics map + std::map metrics_map_; + // Context completion handler + rocprofiler_handler_t handler_; + void* handler_arg_; + mutex_t mutex_; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_CONTEXT_H_ diff --git a/src/core/hsa_proxy_queue.h b/src/core/hsa_proxy_queue.h new file mode 100644 index 00000000..dd4999b9 --- /dev/null +++ b/src/core/hsa_proxy_queue.h @@ -0,0 +1,67 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef _SRC_CORE_HSA_PROXY_QUEUE_H +#define _SRC_CORE_HSA_PROXY_QUEUE_H + +#include +#include +#include +#include + +#include "core/proxy_queue.h" +#include "util/exception.h" + +namespace rocprofiler { +extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; +extern decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; +extern decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; + +class HsaProxyQueue : public ProxyQueue { + public: + hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) { + return hsa_amd_queue_intercept_register_fn(queue_, on_submit_cb, data); + } + + void Submit(const packet_t* packet) { + EXC_RAISING(HSA_STATUS_ERROR, "HsaProxyQueue::Submit() is not supported"); + } + + private: + hsa_status_t Init(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue) { + const auto status = hsa_amd_queue_intercept_create_fn( + agent, size, type, callback, data, private_segment_size, group_segment_size, &queue_); + *queue = queue_; + return status; + } + + hsa_status_t Cleanup() const { return hsa_queue_destroy_fn(queue_); } + + hsa_queue_t* queue_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_HSA_PROXY_QUEUE_H diff --git a/src/core/hsa_queue.h b/src/core/hsa_queue.h new file mode 100644 index 00000000..620f6224 --- /dev/null +++ b/src/core/hsa_queue.h @@ -0,0 +1,80 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef _SRC_CORE_HSA_QUEUE_H +#define _SRC_CORE_HSA_QUEUE_H + +#include + +#include "core/queue.h" +#include "core/types.h" + +namespace rocprofiler { + +class HsaQueue : public Queue { + public: + typedef void (HsaQueue::*submit_fptr_t)(const packet_t* packet); + enum { + LEGACY_SLOT_SIZE_W = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_word_t), + LEGACY_SLOT_SIZE_P = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_t) + }; + struct slot_pm4_t { + packet_word_t words[LEGACY_SLOT_SIZE_W]; + }; + + HsaQueue(const util::AgentInfo* agent_info, hsa_queue_t* queue) : queue_(queue) {} + + void Submit(const packet_t* packet) { + // Compute the write index of queue and copy Aql packet into it + const uint64_t que_idx = hsa_queue_load_write_index_relaxed(queue_); + // Increment the write index + hsa_queue_store_write_index_relaxed(queue_, que_idx + 1); + + const uint32_t mask = queue_->size - 1; + + // Copy packet to the queue + const packet_word_t* src = reinterpret_cast(packet); + packet_t* slot = reinterpret_cast(queue_->base_address) + (que_idx & mask); + packet_word_t* dst = reinterpret_cast(slot); + const uint32_t nwords = sizeof(packet_t) / sizeof(packet_word_t); + for (unsigned i = 1; i < nwords; ++i) { + dst[i] = src[i]; + } + + // To maintain global order to ensure the prior copy of the packet contents is made visible + // before the header is updated. 
+ // With in-order CP it will wait until the first packet in the blob will be valid + std::atomic* header_atomic_ptr = + reinterpret_cast*>(&dst[0]); + header_atomic_ptr->store(src[0], std::memory_order_release); + + // Doorbell signaling + hsa_signal_store_relaxed(queue_->doorbell_signal, que_idx); + } + + private: + hsa_queue_t* queue_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_HSA_QUEUE_H diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp new file mode 100644 index 00000000..a2a289aa --- /dev/null +++ b/src/core/intercept_queue.cpp @@ -0,0 +1,40 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#include "core/intercept_queue.h"
+
+namespace rocprofiler {
+// Replaces the queue create/destroy entries of the HSA core API table
+// with the InterceptQueue static wrappers
+void InterceptQueue::HsaIntercept(HsaApiTable* table) {
+  table->core_->hsa_queue_create_fn = rocprofiler::InterceptQueue::QueueCreate;
+  table->core_->hsa_queue_destroy_fn = rocprofiler::InterceptQueue::QueueDestroy;
+}
+
+// InterceptQueue static members definitions.
+// No callbacks and no tracker are set initially, the objects map is not allocated
+// and the timeout is UINT64_MAX.
+InterceptQueue::mutex_t InterceptQueue::mutex_;
+rocprofiler_callback_t InterceptQueue::dispatch_callback_ = NULL;
+InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL;
+void* InterceptQueue::callback_data_ = NULL;
+InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL;
+const char* InterceptQueue::kernel_none_ = "";
+uint64_t InterceptQueue::timeout_ = UINT64_MAX;
+Tracker* InterceptQueue::tracker_ = NULL;
+bool InterceptQueue::tracker_on_ = false;
+}  // namespace rocprofiler
diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h
new file mode 100644
index 00000000..c5376bb9
--- /dev/null
+++ b/src/core/intercept_queue.h
@@ -0,0 +1,230 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef _SRC_CORE_INTERCEPT_QUEUE_H +#define _SRC_CORE_INTERCEPT_QUEUE_H + +#include +#include +#include + +#include +#include +#include +#include + +#include "core/context.h" +#include "core/proxy_queue.h" +#include "core/tracker.h" +#include "core/types.h" +#include "inc/rocprofiler.h" +#include "util/hsa_rsrc_factory.h" + +namespace rocprofiler { +extern decltype(hsa_queue_create)* hsa_queue_create_fn; +extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + +class InterceptQueue { + public: + typedef std::recursive_mutex mutex_t; + typedef std::map obj_map_t; + typedef hsa_status_t (*queue_callback_t)(hsa_queue_t*, void* data); + + static void HsaIntercept(HsaApiTable* table); + + static hsa_status_t QueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, + void* data), + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t** queue) { + hsa_status_t status = HSA_STATUS_ERROR; + std::lock_guard lck(mutex_); + + ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, callback, data, private_segment_size, + group_segment_size, queue, &status); + if (status != HSA_STATUS_SUCCESS) abort(); + + if (tracker_on_ && (tracker_ == NULL)) { + tracker_ = new Tracker(timeout_); + status = hsa_amd_profiling_set_profiler_enabled(*queue, true); + if (status != HSA_STATUS_SUCCESS) abort(); + } + + if (!obj_map_) obj_map_ = new obj_map_t; + InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy); + (*obj_map_)[(uint64_t)(*queue)] = obj; + status = proxy->SetInterceptCB(OnSubmitCB, obj); + + return status; + } + + 
static hsa_status_t QueueDestroy(hsa_queue_t* queue) { + std::lock_guard lck(mutex_); + hsa_status_t status = HSA_STATUS_ERROR; + + if (destroy_callback_ != NULL) { + status = destroy_callback_(queue, callback_data_); + if (status != HSA_STATUS_SUCCESS) return status; + } + + obj_map_t::iterator it = obj_map_->find((uint64_t)queue); + if (it != obj_map_->end()) { + const InterceptQueue* obj = it->second; + assert(queue == obj->queue_); + delete obj; + obj_map_->erase(it); + status = HSA_STATUS_SUCCESS; + } + + return status; + } + + static void OnSubmitCB(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, + hsa_amd_queue_intercept_packet_writer writer) { + const packet_t* packets_arr = reinterpret_cast(in_packets); + InterceptQueue* obj = reinterpret_cast(data); + Queue* proxy = obj->proxy_; + + for (uint64_t j = 0; j < count; ++j) { + bool to_submit = true; + const packet_t* packet = &packets_arr[j]; + + if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && (dispatch_callback_ != NULL)) { + rocprofiler_group_t group = {}; + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + const char* kernel_name = GetKernelName(dispatch_packet); + const rocprofiler_dispatch_record_t* record = NULL; + if (tracker_ != NULL) { + const auto* entry = tracker_->Add(obj->agent_info_->dev_id, dispatch_packet->completion_signal); + const_cast(dispatch_packet)->completion_signal = entry->signal; + record = entry->record; + } + rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, + obj->agent_info_->dev_index, + obj->queue_, + user_que_idx, + dispatch_packet, + kernel_name, + record}; + hsa_status_t status = dispatch_callback_(&data, callback_data_, &group); + free(const_cast(kernel_name)); + if ((status == HSA_STATUS_SUCCESS) && (group.context != NULL)) { + Context* context = reinterpret_cast(group.context); + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& 
stop_vector = context->StopPackets(group.index); + + pkt_vector_t packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; + } + } + + if (to_submit) { + if (writer != NULL) { + writer(packet, 1); + } else { + proxy->Submit(packet, 1); + } + } + + packet += 1; + } + } + + static void SetCallbacks(rocprofiler_callback_t dispatch_callback, queue_callback_t destroy_callback, void* data) { + std::lock_guard lck(mutex_); + callback_data_ = data; + dispatch_callback_ = dispatch_callback; + destroy_callback_ = destroy_callback; + } + + static void SetTimeout(uint64_t timeout) { timeout_ = timeout; } + static void TrackerOn(bool on) { tracker_on_ = on; } + static bool IsTrackerOn() { return tracker_on_; } + + private: + InterceptQueue(const hsa_agent_t& agent, hsa_queue_t* const queue, ProxyQueue* proxy) : + queue_(queue), + proxy_(proxy) + { + agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + } + ~InterceptQueue() { ProxyQueue::Destroy(proxy_); } + + static packet_word_t GetHeaderType(const packet_t* packet) { + const packet_word_t* header = reinterpret_cast(packet); + return (*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask; + } + + static const char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) { + const amd_kernel_code_t* kernel_code = NULL; + hsa_status_t status = + util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( + reinterpret_cast(dispatch_packet->kernel_object), + reinterpret_cast(&kernel_code)); + if (HSA_STATUS_SUCCESS != status) { + kernel_code = reinterpret_cast(dispatch_packet->kernel_object); + } + amd_runtime_loader_debug_info_t* dbg_info = reinterpret_cast( + kernel_code->runtime_loader_kernel_symbol); + const char* kernel_name = (dbg_info != NULL) ? 
dbg_info->kernel_name : NULL; + + // Kernel name is mangled name + // apply __cxa_demangle() to demangle it + const char* funcname = NULL; + if (kernel_name != NULL) { + size_t funcnamesize = 0; + int status; + const char* ret = abi::__cxa_demangle(kernel_name, NULL, &funcnamesize, &status); + funcname = (ret != 0) ? ret : strdup(kernel_name); + } + if (funcname == NULL) funcname = strdup(kernel_none_); + + return funcname; + } + + static mutex_t mutex_; + static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; + static rocprofiler_callback_t dispatch_callback_; + static queue_callback_t destroy_callback_; + static void* callback_data_; + static obj_map_t* obj_map_; + static const char* kernel_none_; + static uint64_t timeout_; + static Tracker* tracker_; + static bool tracker_on_; + + hsa_queue_t* const queue_; + ProxyQueue* const proxy_; + const util::AgentInfo* agent_info_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_INTERCEPT_QUEUE_H diff --git a/src/core/metrics.cpp b/src/core/metrics.cpp new file mode 100644 index 00000000..67598632 --- /dev/null +++ b/src/core/metrics.cpp @@ -0,0 +1,28 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#include "core/metrics.h"
+
+namespace rocprofiler {
+// MetricsDict static members definitions:
+// the dictionaries map is not allocated initially, the mutex is default constructed
+MetricsDict::map_t* MetricsDict::map_ = NULL;
+MetricsDict::mutex_t MetricsDict::mutex_;
+}
diff --git a/src/core/metrics.h b/src/core/metrics.h
new file mode 100644
index 00000000..8f05a3e7
--- /dev/null
+++ b/src/core/metrics.h
@@ -0,0 +1,302 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_CORE_METRICS_H_ +#define SRC_CORE_METRICS_H_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "core/types.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" +#include "xml/expr.h" +#include "xml/xml.h" + +namespace rocprofiler { +struct counter_t { + std::string name; + event_t event; +}; +typedef std::vector counters_vec_t; + +class Metric { + public: + Metric(const std::string& name) : name_(name) {} + virtual ~Metric() {} + std::string GetName() const { return name_; } + virtual void GetCounters(counters_vec_t& vec) const = 0; + counters_vec_t GetCounters() const { + counters_vec_t counters; + GetCounters(counters); + return counters; + } + virtual const xml::Expr* GetExpr() const = 0; + + private: + const std::string name_; +}; + +class BaseMetric : public Metric { + public: + BaseMetric(const std::string& name, const counter_t& counter) : Metric(name), counter_(counter) {} + void GetCounters(counters_vec_t& vec) const { vec.push_back(&counter_); } + const xml::Expr* GetExpr() const { return NULL; } + + private: + const counter_t counter_; +}; + +class ExprMetric : public Metric { + public: + ExprMetric(const std::string& name, const counters_vec_t& counters, const xml::Expr* expr) + : Metric(name), counters_(counters), expr_(expr) {} + ~ExprMetric() { delete expr_; } + void GetCounters(counters_vec_t& vec) const { + vec.insert(vec.end(), counters_.begin(), counters_.end()); + } + const xml::Expr* GetExpr() const { return expr_; } + + private: + const counters_vec_t counters_; + const xml::Expr* expr_; +}; + +class MetricsDict { + public: + 
typedef std::map cache_t; + typedef cache_t::const_iterator const_iterator_t; + typedef std::map map_t; + typedef std::mutex mutex_t; + + class ExprCache : public xml::expr_cache_t { + public: + ExprCache(const cache_t* cache) : cache_(cache) {} + bool Lookup(const std::string& name, std::string& result) const { + bool ret = false; + auto it = cache_->find(name); + if (it != cache_->end()) { + ret = true; + const rocprofiler::ExprMetric* expr_metric = + dynamic_cast(it->second); + if (expr_metric) result = expr_metric->GetExpr()->GetStr(); + } + return ret; + } + + private: + const cache_t* const cache_; + }; + + static MetricsDict* Create(const util::AgentInfo* agent_info) { + std::lock_guard lck(mutex_); + if (map_ == NULL) map_ = new map_t; + auto ret = map_->insert({agent_info->gfxip, NULL}); + if (ret.second) ret.first->second = new MetricsDict(agent_info); + return ret.first->second; + } + + static void Destroy() { + if (map_ != NULL) { + for (auto& entry : *map_) delete entry.second; + delete map_; + map_ = NULL; + } + } + + const Metric* Get(const std::string& name) const { + const Metric* metric = NULL; + + auto it = cache_.find(name); + if (it != cache_.end()) metric = it->second; + else { + const std::size_t pos = name.find(':'); + if (pos != std::string::npos) { + std::string block_name = name.substr(0, pos); + const std::string event_str = name.substr(pos + 1); + + uint32_t block_index = 0; + bool indexed = false; + const std::size_t pos1 = block_name.find('['); + if (pos1 != std::string::npos) { + const std::size_t pos2 = block_name.find(']'); + if (pos2 == std::string::npos) EXC_RAISING(HSA_STATUS_ERROR, "Malformed metric name '" << name << "'"); + block_name = name.substr(0, pos1); + const std::string block_index_str = name.substr(pos1 + 1, pos2 - (pos1 + 1)); + block_index = atol(block_index_str.c_str()); + indexed = true; + } + + const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info_, block_name); + const 
hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id; + if ((query.instance_count > 1) && (indexed == false)) EXC_RAISING(HSA_STATUS_ERROR, "Malformed indexed metric name '" << name << "'"); + const uint32_t event_id = atol(event_str.c_str()); + const counter_t counter = {name, {block_id, block_index, event_id}}; + metric = new BaseMetric(name, counter); + } + } + + return metric; + } + + uint32_t Size() const { return cache_.size(); } + const_iterator_t Begin() const { return cache_.begin(); } + const_iterator_t End() const { return cache_.end(); } + + xml::Xml::nodes_t GetNodes(const std::string& scope) const { + return xml_->GetNodes("top." + scope + ".metric"); + } + + private: + MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL), agent_info_(agent_info) { + const char* xml_name = getenv("ROCP_METRICS"); + if (xml_name != NULL) { + xml_ = xml::Xml::Create(xml_name); + if (xml_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metrics .xml open error '" << xml_name << "'"); + xml_->AddConst("top.const.metric", "MAX_WAVE_SIZE", agent_info->max_wave_size); + xml_->AddConst("top.const.metric", "CU_NUM", agent_info->cu_num); + xml_->AddConst("top.const.metric", "SIMD_NUM", agent_info->simds_per_cu * agent_info->cu_num); + xml_->AddConst("top.const.metric", "SE_NUM", agent_info->se_num); + ImportMetrics(agent_info, "const"); + ImportMetrics(agent_info, agent_info->gfxip); + ImportMetrics(agent_info, "global"); + } + } + + ~MetricsDict() { + xml::Xml::Destroy(xml_); + for (auto& entry : cache_) delete entry.second; + } + + static hsa_ven_amd_aqlprofile_id_query_t Translate(const util::AgentInfo* agent_info, const std::string& block_name) { + hsa_ven_amd_aqlprofile_profile_t profile; + profile.agent = agent_info->dev_id; + hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0}; + hsa_status_t status = + util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, 
HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: bad block name '" << block_name << "'"); + return query; + } + + void ImportMetrics(const util::AgentInfo* agent_info, const std::string& scope) { + auto metrics_list = xml_->GetNodes("top." + scope + ".metric"); + if (!metrics_list.empty()) { + for (auto node : metrics_list) { + const std::string name = node->opts["name"]; + const std::string expr_str = node->opts["expr"]; + std::string descr = node->opts["descr"]; + if (descr.empty()) descr = (expr_str.empty()) ? name : expr_str; + + if (expr_str.empty()) { + const std::string block_name = node->opts["block"]; + const std::string event_str = node->opts["event"]; + const uint32_t event_id = atol(event_str.c_str()); + + const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info, block_name); + const hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id; + if (query.instance_count > 1) { + for (unsigned block_index = 0; block_index < query.instance_count; ++block_index) { + std::ostringstream full_name; + full_name << name << '[' << block_index << ']'; + std::ostringstream block_insance; + block_insance << block_name << "[" << block_index << "]"; + std::ostringstream alias; + alias << block_insance.str() << ":" << event_str; + const counter_t counter = {full_name.str(), {block_id, block_index, event_id}}; + AddMetric(full_name.str(), alias.str(), counter); + } + } else { + const std::string alias = block_name + ":" + event_str; + const counter_t counter = {name, {block_id, 0, event_id}}; + AddMetric(name, alias, counter); + } + } else { + xml::Expr* expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); +#if 0 + std::cout << "# " << descr << std::endl; + std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; +#endif + counters_vec_t counters_vec; + for (const std::string var : expr_obj->GetVars()) { + auto it = 
cache_.find(var); + if (it == cache_.end()) + EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var + << "' is not found"); + it->second->GetCounters(counters_vec); + } + AddMetric(name, counters_vec, expr_obj); + } + } + } + } + + const Metric* AddMetric(const std::string& name, const std::string& /*alias*/, const counter_t& counter) { + const Metric* metric = NULL; + const auto ret = cache_.insert({name, NULL}); + if (ret.second) { + metric = new BaseMetric(name, counter); + ret.first->second = metric; + } else EXC_RAISING(HSA_STATUS_ERROR, "metric redefined '" << name << "'"); + return metric; + } + + const Metric* AddMetric(const std::string& name, const counters_vec_t& counters_vec, const xml::Expr* expr_obj) { + const Metric* metric = NULL; + const auto ret = cache_.insert({name, NULL}); + if (ret.second) { + metric = new ExprMetric(name, counters_vec, expr_obj); + ret.first->second = metric; + } else EXC_RAISING(HSA_STATUS_ERROR, "expr-metric redefined '" << name << "'"); + return metric; + } + + void Print() { + for (auto& v : cache_) { + const Metric* metric = v.second; + counters_vec_t counters_vec; + printf("> Metric '%s'\n", metric->GetName().c_str()); + metric->GetCounters(counters_vec); + for (auto c : counters_vec) { + printf(" counter %s, b(%u), i (%u), e (%u)\n", c->name.c_str(), c->event.block_name, c->event.block_index, c->event.counter_id); + } + } + } + + xml::Xml* xml_; + const util::AgentInfo* agent_info_; + cache_t cache_; + + static map_t* map_; + static mutex_t mutex_; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_METRICS_H_ diff --git a/src/core/profile.h b/src/core/profile.h new file mode 100644 index 00000000..43d30a21 --- /dev/null +++ b/src/core/profile.h @@ -0,0 +1,271 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef SRC_CORE_PROFILE_H_ +#define SRC_CORE_PROFILE_H_ + +#include "inc/rocprofiler.h" + +#include +#include + +#include "core/types.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" + +namespace rocprofiler { +struct profile_info_t { + const event_t* event; + const parameter_t* parameters; + uint32_t parameter_count; + rocprofiler_feature_t* rinfo; +}; +typedef std::vector info_vector_t; +typedef std::vector pkt_vector_t; +struct profile_tuple_t { + const profile_t* profile; + info_vector_t* info_vector; + hsa_signal_t completion_signal; +}; +typedef std::vector profile_vector_t; + +template class ConfigBase {}; + +template <> class ConfigBase { + public: + ConfigBase(profile_t* profile) : profile_(profile) {} + + protected: + void* Array() { return const_cast(profile_->events); } + unsigned Count() const { return profile_->event_count; } + void Set(event_t* events, const unsigned& count) { + profile_->events = events; + profile_->event_count = count; + } + profile_t* profile_; +}; + +template <> class ConfigBase { + public: + ConfigBase(profile_t* profile) : profile_(profile) {} + + protected: + void* Array() { return const_cast(profile_->parameters); } + unsigned Count() const { return profile_->parameter_count; } + void Set(parameter_t* parameters, const unsigned& count) { + profile_->parameters = parameters; + profile_->parameter_count = count; + } + profile_t* profile_; +}; + +template class Config : protected ConfigBase { + typedef ConfigBase Parent; + + public: + Config(profile_t* profile) : Parent(profile) {} + void Insert(const Item& item) { + auto count = Parent::Count(); + count += 1; + Item* array = + reinterpret_cast(realloc(const_cast(Parent::Array()), count * sizeof(Item))); + array[count - 1] = item; + Parent::Set(array, count); + } +}; + +class Profile { + public: + static const uint32_t LEGACY_SLOT_SIZE_PKT = + 
HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_t); + + Profile(const util::AgentInfo* agent_info) : agent_info_(agent_info) { + profile_ = {}; + profile_.agent = agent_info->dev_id; + completion_signal_ = {}; + is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0); + } + + virtual ~Profile() { + info_vector_.clear(); + if (profile_.command_buffer.ptr) util::HsaRsrcFactory::FreeMemory(profile_.command_buffer.ptr); + if (profile_.output_buffer.ptr) util::HsaRsrcFactory::FreeMemory(profile_.output_buffer.ptr); + if (profile_.events) free(const_cast(profile_.events)); + if (profile_.parameters) free(const_cast(profile_.parameters)); + if (completion_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(completion_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } + } + + virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); } + + hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, pkt_vector_t& read_vector) { + hsa_status_t status = HSA_STATUS_SUCCESS; + + if (!info_vector_.empty()) { + util::HsaRsrcFactory* rsrc = &util::HsaRsrcFactory::Instance(); + const pfn_t* api = rsrc->AqlProfileApi(); + packet_t start{}; + packet_t stop{}; + packet_t read{}; + + // Check the profile buffer sizes + status = api->hsa_ven_amd_aqlprofile_start(&profile_, NULL); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start(NULL)"); + status = Allocate(rsrc); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "Allocate()"); + + // Generate start/stop/read profiling packets + status = api->hsa_ven_amd_aqlprofile_start(&profile_, &start); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start"); + status = api->hsa_ven_amd_aqlprofile_stop(&profile_, &stop); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_stop"); + hsa_status_t rd_status = 
api->hsa_ven_amd_aqlprofile_read(&profile_, &read); +#if 0 // Read API returns error if disabled + if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); +#endif + + // Set completion signal + hsa_signal_t dummy_signal{}; + dummy_signal.handle = 0; + start.completion_signal = dummy_signal; + hsa_signal_t post_signal; + status = hsa_signal_create(1, 0, NULL, &post_signal); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + stop.completion_signal = post_signal; + read.completion_signal = post_signal; + completion_signal_ = post_signal; + + // Fill packet vectors + if (is_legacy_) { + const uint32_t start_index = start_vector.size(); + const uint32_t stop_index = stop_vector.size(); + + start_vector.insert(start_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); + stop_vector.insert(stop_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); + + status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4( + &start, reinterpret_cast(&start_vector[start_index])); + if (status != HSA_STATUS_SUCCESS) + AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + + status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4( + &stop, reinterpret_cast(&stop_vector[stop_index])); + if (status != HSA_STATUS_SUCCESS) + AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + + if (rd_status == HSA_STATUS_SUCCESS) { + const uint32_t read_index = read_vector.size(); + read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); + status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4( + &read, reinterpret_cast(&read_vector[read_index])); + if (status != HSA_STATUS_SUCCESS) + AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + } + } else { + start_vector.push_back(start); + stop_vector.push_back(stop); + if (rd_status == HSA_STATUS_SUCCESS) { + read_vector.push_back(read); + } + } + } + + return status; + } + + void GetProfiles(profile_vector_t& vec) { + if (!info_vector_.empty()) { + 
vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_}); + } + } + + bool Empty() const { return info_vector_.empty(); } + + protected: + virtual hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) = 0; + + const util::AgentInfo* const agent_info_; + bool is_legacy_; + profile_t profile_; + info_vector_t info_vector_; + hsa_signal_t completion_signal_; +}; + +class PmcProfile : public Profile { + public: + PmcProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + } + + void Insert(const profile_info_t& info) { + Profile::Insert(info); + Config(&profile_).Insert(*(info.event)); + } + + hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { + profile_.command_buffer.ptr = + rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + profile_.output_buffer.ptr = rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); + return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? HSA_STATUS_SUCCESS + : HSA_STATUS_ERROR; + } +}; + +class SqttProfile : public Profile { + public: + static inline void SetSize(const uint32_t& size) { output_buffer_size_ = size; } + static inline uint32_t GetSize() { return output_buffer_size_; } + static inline void SetLocal(const bool& b) { output_buffer_local_ = b; } + static inline bool IsLocal() { return output_buffer_local_; } + + SqttProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_SQTT; + } + + void Insert(const profile_info_t& info) { + Profile::Insert(info); + for (unsigned j = 0; j < info.parameter_count; ++j) { + Config(&profile_).Insert(info.parameters[j]); + } + } + + hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { + profile_.command_buffer.ptr = + rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + profile_.output_buffer.size = output_buffer_size_; + profile_.output_buffer.ptr = (output_buffer_local_) ? 
+ rsrc->AllocateLocalMemory(agent_info_, profile_.output_buffer.size) : + rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); + return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? HSA_STATUS_SUCCESS + : HSA_STATUS_ERROR; + } + + private: + static uint32_t output_buffer_size_; + static bool output_buffer_local_; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_PROFILE_H_ diff --git a/src/core/proxy_queue.cpp b/src/core/proxy_queue.cpp new file mode 100644 index 00000000..7a4f4476 --- /dev/null +++ b/src/core/proxy_queue.cpp @@ -0,0 +1,63 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "core/proxy_queue.h" + +#include "core/hsa_proxy_queue.h" +#include "core/simple_proxy_queue.h" + +namespace rocprofiler { +void ProxyQueue::HsaIntercept(HsaApiTable* table) { + if (rocp_type_) SimpleProxyQueue::HsaIntercept(table); +} + +ProxyQueue* ProxyQueue::Create(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, + void* data), + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t** queue, + hsa_status_t* status) { + hsa_status_t suc = HSA_STATUS_ERROR; + ProxyQueue* instance = + (rocp_type_) ? (ProxyQueue*) new SimpleProxyQueue() : (ProxyQueue*) new HsaProxyQueue(); + if (instance != NULL) { + suc = instance->Init(agent, size, type, callback, data, private_segment_size, + group_segment_size, queue); + if (suc != HSA_STATUS_SUCCESS) { + delete instance; + instance = NULL; + } + } + *status = suc; + assert(*status == HSA_STATUS_SUCCESS); + return instance; +} + +hsa_status_t ProxyQueue::Destroy(const ProxyQueue* obj) { + assert(obj != NULL); + auto suc = obj->Cleanup(); + delete obj; + return suc; +} + +bool ProxyQueue::rocp_type_ = false; +} // namespace rocprofiler diff --git a/src/core/proxy_queue.h b/src/core/proxy_queue.h new file mode 100644 index 00000000..42e6c63b --- /dev/null +++ b/src/core/proxy_queue.h @@ -0,0 +1,77 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#ifndef _SRC_CORE_PROXY_QUEUE_H
+#define _SRC_CORE_PROXY_QUEUE_H
+
+// NOTE(review): the header names of the five #include lines below are missing in this copy
+// of the patch - restore them from the repository
+#include
+#include
+#include
+#include
+#include
+
+#include "core/queue.h"
+#include "core/types.h"
+
+struct HsaApiTable;
+
+namespace rocprofiler {
+// Writer callback type which takes a packet array and the packet count
+typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* packets, uint64_t count);
+// On-submit callback type, registered with ProxyQueue::SetInterceptCB() together with 'data'
+typedef void (*on_submit_cb_t)(const void* packet, uint64_t count, uint64_t que_idx, void* data,
+                               hsa_amd_queue_intercept_packet_writer writer);
+
+// Abstract proxy queue and the factory of the proxy queue implementations
+class ProxyQueue : public Queue {
+ public:
+  // Selects the proxy queue implementation: the "rocp" type is enabled if the value of the
+  // ROCP_PROXY_QUEUE environment variable starts with "rocp"
+  static void InitFactory() {
+    const char* type = getenv("ROCP_PROXY_QUEUE");
+    if (type != NULL) {
+      if (strncmp(type, "rocp", 4) == 0) rocp_type_ = true;
+    }
+  }
+
+  // Implemented in proxy_queue.cpp, not shown here
+  static void HsaIntercept(HsaApiTable* table);
+
+  // Factory method; the parameters are the same as for Init() plus the returned status
+  static ProxyQueue* Create(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
+                            void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
+                            void* data, uint32_t private_segment_size, uint32_t group_segment_size,
+                            hsa_queue_t** queue, hsa_status_t* status);
+
+  // Counterpart of Create()
+  static hsa_status_t Destroy(const ProxyQueue* obj);
+
+  // Interface to be implemented by the proxy queue types
+  virtual hsa_status_t Init(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
+                            void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
+                            void* data, uint32_t private_segment_size, uint32_t group_segment_size,
+                            hsa_queue_t** queue) = 0;
+  virtual hsa_status_t Cleanup() const = 0;
+  virtual hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) = 0;
+  virtual void Submit(const packet_t* packet) = 0;
+
+ protected:
+  // Protected destructor: outside code cannot delete a ProxyQueue directly
+  virtual ~ProxyQueue(){};
+
+ private:
+  // True if the "rocp" proxy queue type is selected, set by InitFactory()
+  static bool rocp_type_;
+};
+
+} // namespace rocprofiler
+
+#endif // _SRC_CORE_PROXY_QUEUE_H
diff --git a/src/core/queue.h b/src/core/queue.h
new file mode 100644
index 00000000..07e3b45b
--- /dev/null
+++ b/src/core/queue.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced
Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef _SRC_CORE_QUEUE_H +#define _SRC_CORE_QUEUE_H + +#include "core/types.h" + +namespace rocprofiler { + +class Queue { + public: + Queue() {} + virtual ~Queue() {} + virtual void Submit(const packet_t* packet) = 0; + virtual void Submit(const packet_t* packet, const size_t& count) { + for (const packet_t* p = packet; p < packet + count; ++p) Submit(p); + } +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_QUEUE_H diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp new file mode 100644 index 00000000..a96fadba --- /dev/null +++ b/src/core/rocprofiler.cpp @@ -0,0 +1,522 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "inc/rocprofiler.h" + +#include +#include +#include +#include + +#include "core/context.h" +#include "core/hsa_queue.h" +#include "core/intercept_queue.h" +#include "core/proxy_queue.h" +#include "core/simple_proxy_queue.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" +#include "util/logger.h" + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +#define API_METHOD_PREFIX \ + hsa_status_t status = HSA_STATUS_SUCCESS; \ + try { + +#define API_METHOD_SUFFIX \ + } \ + catch (std::exception & e) { \ + ERR_LOGGING(__FUNCTION__ << "(), " << e.what()); \ + status = rocprofiler::GetExcStatus(e); \ + } \ + return status; + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Internal library methods +// +namespace rocprofiler { +decltype(hsa_queue_create)* hsa_queue_create_fn; +decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + +decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; +decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn; + +decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; +decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; +decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; + +decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn; +decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn; +decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn; + +decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; +decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; + +::HsaApiTable* kHsaApiTable; + +void 
SaveHsaApi(::HsaApiTable* table) { + kHsaApiTable = table; + hsa_queue_create_fn = table->core_->hsa_queue_create_fn; + hsa_queue_destroy_fn = table->core_->hsa_queue_destroy_fn; + + hsa_signal_store_relaxed_fn = table->core_->hsa_signal_store_relaxed_fn; + hsa_signal_store_screlease_fn = table->core_->hsa_signal_store_screlease_fn; + + hsa_queue_load_write_index_relaxed_fn = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_queue_store_write_index_relaxed_fn = table->core_->hsa_queue_store_write_index_relaxed_fn; + hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn; + + hsa_queue_load_write_index_scacquire_fn = table->core_->hsa_queue_load_write_index_scacquire_fn; + hsa_queue_store_write_index_screlease_fn = table->core_->hsa_queue_store_write_index_screlease_fn; + hsa_queue_load_read_index_scacquire_fn = table->core_->hsa_queue_load_read_index_scacquire_fn; + + hsa_amd_queue_intercept_create_fn = table->amd_ext_->hsa_amd_queue_intercept_create_fn; + hsa_amd_queue_intercept_register_fn = table->amd_ext_->hsa_amd_queue_intercept_register_fn; +} + +void RestoreHsaApi() { + ::HsaApiTable* table = kHsaApiTable; + table->core_->hsa_queue_create_fn = hsa_queue_create_fn; + table->core_->hsa_queue_destroy_fn = hsa_queue_destroy_fn; + + table->core_->hsa_signal_store_relaxed_fn = hsa_signal_store_relaxed_fn; + table->core_->hsa_signal_store_screlease_fn = hsa_signal_store_screlease_fn; + + table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn; + table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn; + table->core_->hsa_queue_load_read_index_relaxed_fn = hsa_queue_load_read_index_relaxed_fn; + + table->core_->hsa_queue_load_write_index_scacquire_fn = hsa_queue_load_write_index_scacquire_fn; + table->core_->hsa_queue_store_write_index_screlease_fn = hsa_queue_store_write_index_screlease_fn; + table->core_->hsa_queue_load_read_index_scacquire_fn = 
hsa_queue_load_read_index_scacquire_fn; + + table->amd_ext_->hsa_amd_queue_intercept_create_fn = hsa_amd_queue_intercept_create_fn; + table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; +} + +typedef void (*tool_handler_t)(); +typedef void (*tool_handler_prop_t)(rocprofiler_settings_t*); +void * tool_handle = NULL; + +// Load profiling tool library +// Return true if intercepting mode is enabled +bool LoadTool() { + bool intercept_mode = false; + const char* tool_lib = getenv("ROCP_TOOL_LIB"); + + if (tool_lib) { + intercept_mode = true; + + tool_handle = dlopen(tool_lib, RTLD_NOW); + if (tool_handle == NULL) { + fprintf(stderr, "ROCProfiler: can't load tool library \"%s\"\n", tool_lib); + fprintf(stderr, "%s\n", dlerror()); + abort(); + } + tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnLoadTool")); + tool_handler_prop_t handler_prop = reinterpret_cast(dlsym(tool_handle, "OnLoadToolProp")); + if ((handler == NULL) && (handler_prop == NULL)) { + fprintf(stderr, "ROCProfiler: tool library corrupted, OnLoadTool()/OnLoadToolProp() method is expected\n"); + fprintf(stderr, "%s\n", dlerror()); + abort(); + } + tool_handler_t on_unload_handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); + if (on_unload_handler == NULL) { + fprintf(stderr, "ROCProfiler: tool library corrupted, OnUnloadTool() method is expected\n"); + fprintf(stderr, "%s\n", dlerror()); + abort(); + } + + rocprofiler_settings_t settings{}; + settings.intercept_mode = (intercept_mode) ? 1 : 0; + settings.sqtt_size = SqttProfile::GetSize(); + settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; + settings.timeout = Context::GetTimeout(); + settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; + + if (handler) handler(); + else if (handler_prop) handler_prop(&settings); + + intercept_mode = (settings.intercept_mode != 0); + SqttProfile::SetSize(settings.sqtt_size); + SqttProfile::SetLocal(settings.sqtt_local != 0); + Context::SetTimeout(settings.timeout); + InterceptQueue::SetTimeout(settings.timeout); + InterceptQueue::TrackerOn(settings.timestamp_on != 0); + } + + return intercept_mode; +} + +// Unload profiling tool librray +void UnloadTool() { + if (tool_handle) { + tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); + if (handler == NULL) { + fprintf(stderr, "ROCProfiler error: tool library corrupted, OnUnloadTool() method is expected\n"); + fprintf(stderr, "%s\n", dlerror()); + abort(); + } + handler(); + dlclose(tool_handle); + } +} + +CONSTRUCTOR_API void constructor() { + util::Logger::Create(); +} + +DESTRUCTOR_API void destructor() { + util::HsaRsrcFactory::Destroy(); + rocprofiler::MetricsDict::Destroy(); + util::Logger::Destroy(); +} + +const MetricsDict* GetMetrics(const hsa_agent_t& agent) { + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + if (agent_info == NULL) { + EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); + } + const MetricsDict* metrics = MetricsDict::Create(agent_info); + if (metrics == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + return metrics; +} + +hsa_status_t GetExcStatus(const std::exception& e) { + const util::exception* rocprofiler_exc_ptr = dynamic_cast(&e); + return (rocprofiler_exc_ptr) ? 
static_cast(rocprofiler_exc_ptr->status()) + : HSA_STATUS_ERROR; +} + +rocprofiler_properties_t rocprofiler_properties; +uint64_t Context::timeout_ = UINT64_MAX; +uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M +bool SqttProfile::output_buffer_local_ = true; +Tracker::mutex_t Tracker::mutex_; +util::Logger::mutex_t util::Logger::mutex_; +util::Logger* util::Logger::instance_ = NULL; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Public library methods +// +extern "C" { + +// HSA-runtime tool on-load method +PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, + const char* const* failed_tool_names) { + rocprofiler::SaveHsaApi(table); + rocprofiler::ProxyQueue::InitFactory(); + bool intercept_mode = false; + const char* intercept_env = getenv("ROCP_HSA_INTERCEPT"); + if (intercept_env != NULL) { + if (strncmp(intercept_env, "1", 1) == 0) intercept_mode = true; + } + if (rocprofiler::LoadTool()) intercept_mode = true; + // HSA intercepting + if (intercept_mode) { + rocprofiler::ProxyQueue::HsaIntercept(table); + rocprofiler::InterceptQueue::HsaIntercept(table); + } + return true; +} + +// HSA-runtime tool on-unload method +PUBLIC_API void OnUnload() { + rocprofiler::UnloadTool(); + rocprofiler::RestoreHsaApi(); +} + +// Returns library vesrion +PUBLIC_API uint32_t rocprofiler_version_major() { return ROCPROFILER_VERSION_MAJOR; } +PUBLIC_API uint32_t rocprofiler_version_minor() { return ROCPROFILER_VERSION_MINOR; } + +// Returns the last error message +PUBLIC_API hsa_status_t rocprofiler_error_string(const char** str) { + API_METHOD_PREFIX + *str = rocprofiler::util::Logger::LastMessage().c_str(); + API_METHOD_SUFFIX +} + +// Create new profiling context +PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_t* features, + uint32_t feature_count, rocprofiler_t** handle, uint32_t mode, + rocprofiler_properties_t* properties) 
{ + API_METHOD_PREFIX + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + if (agent_info == NULL) { + EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); + } + + rocprofiler::Queue* queue = NULL; + if (mode != 0) { + if (mode & ROCPROFILER_MODE_STANDALONE) { + if (mode & ROCPROFILER_MODE_CREATEQUEUE) { + if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == + false) { + EXC_RAISING(HSA_STATUS_ERROR, "CreateQueue() failed"); + } + } + queue = new rocprofiler::HsaQueue(agent_info, properties->queue); + } else { + EXC_RAISING(HSA_STATUS_ERROR, "invalid mode (" << mode << ")"); + } + } + + *handle = new rocprofiler::Context(agent_info, queue, features, feature_count, properties->handler, + properties->handler_arg); + API_METHOD_SUFFIX +} + +// Delete profiling info +PUBLIC_API hsa_status_t rocprofiler_close(rocprofiler_t* handle) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + if (context) delete context; + API_METHOD_SUFFIX +} + +// Reset context +PUBLIC_API hsa_status_t rocprofiler_reset(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Reset(group_index); + API_METHOD_SUFFIX +} + +// Get profiling group count +PUBLIC_API hsa_status_t rocprofiler_group_count(const rocprofiler_t* handle, + uint32_t* group_count) { + API_METHOD_PREFIX + const rocprofiler::Context* context = reinterpret_cast(handle); + *group_count = context->GetGroupCount(); + API_METHOD_SUFFIX +} + +// Get profiling group for a given group index +PUBLIC_API hsa_status_t rocprofiler_get_group(rocprofiler_t* handle, uint32_t group_index, + rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + *group = context->GetGroupInfo(group_index); + API_METHOD_SUFFIX +} + +// 
Start profiling +PUBLIC_API hsa_status_t rocprofiler_start(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Start(group_index); + API_METHOD_SUFFIX +} + +// Stop profiling +PUBLIC_API hsa_status_t rocprofiler_stop(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Stop(group_index); + API_METHOD_SUFFIX +} + +// Read profiling +PUBLIC_API hsa_status_t rocprofiler_read(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Read(group_index); + API_METHOD_SUFFIX +} + +// Get profiling data +PUBLIC_API hsa_status_t rocprofiler_get_data(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->GetData(group_index); + API_METHOD_SUFFIX +} + +// Start profiling +PUBLIC_API hsa_status_t rocprofiler_group_start(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler_start(group->context, group->index); + API_METHOD_SUFFIX +} + +// Stop profiling +PUBLIC_API hsa_status_t rocprofiler_group_stop(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler_stop(group->context, group->index); + API_METHOD_SUFFIX +} + +// Read profiling +PUBLIC_API hsa_status_t rocprofiler_group_read(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler_read(group->context, group->index); + API_METHOD_SUFFIX +} + +// Get profiling data +PUBLIC_API hsa_status_t rocprofiler_group_get_data(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(group->context); + context->GetData(group->index); + API_METHOD_SUFFIX +} + +// Get metrics data +PUBLIC_API hsa_status_t rocprofiler_get_metrics(const rocprofiler_t* handle) { + API_METHOD_PREFIX + const rocprofiler::Context* context = reinterpret_cast(handle); 
+ context->GetMetricsData(); + API_METHOD_SUFFIX +} + +// Set/remove queue callbacks +PUBLIC_API hsa_status_t rocprofiler_set_queue_callbacks(rocprofiler_queue_callbacks_t callbacks, void* data) { + API_METHOD_PREFIX + rocprofiler::InterceptQueue::SetCallbacks(callbacks.dispatch, callbacks.destroy, data); + API_METHOD_SUFFIX +} + +// Remove queue callbacks +PUBLIC_API hsa_status_t rocprofiler_remove_queue_callbacks() { + API_METHOD_PREFIX + rocprofiler::InterceptQueue::SetCallbacks(NULL, NULL, NULL); + API_METHOD_SUFFIX +} + +// Method for iterating the events output data +PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data( + rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->IterateTraceData(callback, data); + API_METHOD_SUFFIX +} + +// Return the info for a given info kind +PUBLIC_API hsa_status_t rocprofiler_get_info( + const hsa_agent_t *agent, + rocprofiler_info_kind_t kind, + void *data) +{ + API_METHOD_PREFIX + if (agent == NULL) EXC_RAISING(HSA_STATUS_ERROR, "NULL agent"); + uint32_t* result_32bit_ptr = reinterpret_cast(data); + + switch (kind) { + case ROCPROFILER_INFO_KIND_METRIC_COUNT: + *result_32bit_ptr = rocprofiler::GetMetrics(*agent)->Size(); + break; + case ROCPROFILER_INFO_KIND_TRACE_COUNT: + *result_32bit_ptr = 1; + break; + default: + EXC_RAISING(HSA_STATUS_ERROR, "unknown info kind(" << kind << ")"); + } + API_METHOD_SUFFIX +} + +// Iterate over the info for a given info kind, and invoke an application-defined callback on every iteration +PUBLIC_API hsa_status_t rocprofiler_iterate_info( + const hsa_agent_t* agent, + rocprofiler_info_kind_t kind, + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void* data), + void* data) +{ + API_METHOD_PREFIX + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + rocprofiler_info_data_t info{}; + info.kind = kind; + uint32_t 
agent_idx = 0; + uint32_t agent_max = 0; + const rocprofiler::util::AgentInfo* agent_info = NULL; + + if (agent != NULL) { + agent_info = hsa_rsrc->GetAgentInfo(*agent); + agent_idx = agent_info->dev_index; + agent_max = agent_idx + 1; + } + + while (hsa_rsrc->GetGpuAgentInfo(agent_idx, &agent_info)) { + info.agent_index = agent_idx; + + switch (kind) { + case ROCPROFILER_INFO_KIND_METRIC: + { + const rocprofiler::MetricsDict* dict = rocprofiler::GetMetrics(agent_info->dev_id); + auto nodes_vec = dict->GetNodes(agent_info->gfxip); + auto global_vec = dict->GetNodes("global"); + nodes_vec.insert(nodes_vec.end(), global_vec.begin(), global_vec.end()); + + for (auto* node : nodes_vec) { + const std::string& name = node->opts["name"]; + const std::string& descr = node->opts["descr"]; + const std::string& expr = node->opts["expr"]; + info.metric.name = strdup(name.c_str()); + info.metric.description = strdup(descr.c_str()); + info.metric.expr = expr.empty() ? NULL : strdup(expr.c_str()); + status = callback(info, data); + if (status != HSA_STATUS_SUCCESS) break; + } + break; + } + case ROCPROFILER_INFO_KIND_TRACE: + { + info.trace.name = strdup("TT"); + info.trace.description = strdup("Thread Trace"); + info.trace.parameter_count = 5; + status = callback(info, data); + if (status != HSA_STATUS_SUCCESS) break; + break; + } + default: + EXC_RAISING(HSA_STATUS_ERROR, "unknown info kind(" << kind << ")"); + } + + ++agent_idx; + if (agent_idx == agent_max) break; + } + + if (status == HSA_STATUS_INFO_BREAK) status = HSA_STATUS_SUCCESS; + if (status != HSA_STATUS_SUCCESS) ERR_LOGGING("iterate_info error, info kind(" << kind << ")"); + + API_METHOD_SUFFIX +} + +// Iterate over the info for a given info query, and invoke an application-defined callback on every iteration +PUBLIC_API hsa_status_t rocprofiler_query_info( + const hsa_agent_t *agent, + rocprofiler_info_query_t query, + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), + void *data) +{ + 
API_METHOD_PREFIX + EXC_RAISING(HSA_STATUS_ERROR, "Not implemented"); + API_METHOD_SUFFIX +} + +} // extern "C" diff --git a/src/core/simple_proxy_queue.cpp b/src/core/simple_proxy_queue.cpp new file mode 100644 index 00000000..1c3b5ae1 --- /dev/null +++ b/src/core/simple_proxy_queue.cpp @@ -0,0 +1,40 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "core/simple_proxy_queue.h" + +namespace rocprofiler { +void SimpleProxyQueue::HsaIntercept(HsaApiTable* table) { + table->core_->hsa_signal_store_relaxed_fn = rocprofiler::SimpleProxyQueue::SignalStore; + table->core_->hsa_signal_store_screlease_fn = rocprofiler::SimpleProxyQueue::SignalStore; + + table->core_->hsa_queue_load_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex; + table->core_->hsa_queue_store_write_index_relaxed_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex; + table->core_->hsa_queue_load_read_index_relaxed_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex; + + table->core_->hsa_queue_load_write_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetQueueIndex; + table->core_->hsa_queue_store_write_index_screlease_fn = rocprofiler::SimpleProxyQueue::SetQueueIndex; + table->core_->hsa_queue_load_read_index_scacquire_fn = rocprofiler::SimpleProxyQueue::GetSubmitIndex; +} + +SimpleProxyQueue::queue_map_t* SimpleProxyQueue::queue_map_ = NULL; +} // namespace rocprofiler diff --git a/src/core/simple_proxy_queue.h b/src/core/simple_proxy_queue.h new file mode 100644 index 00000000..8bad833a --- /dev/null +++ b/src/core/simple_proxy_queue.h @@ -0,0 +1,262 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef _SRC_CORE_SIMPLE_PROXY_QUEUE_H +#define _SRC_CORE_SIMPLE_PROXY_QUEUE_H + +#include +#include +#include +#include + +#include "core/proxy_queue.h" +#include "core/types.h" +#include "util/hsa_rsrc_factory.h" + +#ifndef ROCP_PROXY_LOCK +# define ROCP_PROXY_LOCK 1 +#endif + +namespace rocprofiler { +extern decltype(hsa_queue_create)* hsa_queue_create_fn; +extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + +extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; +extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn; + +extern decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; +extern decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; +extern decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; + +extern decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn; +extern decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn; +extern decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn; + +typedef decltype(hsa_signal_t::handle) signal_handle_t; + + +class SimpleProxyQueue : public ProxyQueue { + public: + static void HsaIntercept(HsaApiTable* table); + + static void SignalStore(hsa_signal_t signal, hsa_signal_value_t que_idx) { + auto it = queue_map_->find(signal.handle); + if (it != queue_map_->end()) { + SimpleProxyQueue* instance = it->second; + instance->mutex_lock(); + const uint64_t begin = instance->submit_index_; + const uint64_t end = que_idx + 1; + instance->submit_index_ = end; + instance->mutex_unlock(); + for (uint64_t j = begin; j < end; ++j) { + // Submited packet + const uint32_t idx = j & instance->queue_mask_; + packet_t* packet = reinterpret_cast(instance->queue_->base_address) + idx; + if (instance->on_submit_cb_ != NULL) 
+ instance->on_submit_cb_(packet, 1, j, instance->on_submit_cb_data_, NULL); + else + instance->Submit(packet); + } + } else { + hsa_signal_store_relaxed_fn(signal, que_idx); + } + } + + static uint64_t GetSubmitIndex(const hsa_queue_t* queue) { + uint64_t index = 0; + auto it = queue_map_->find(queue->doorbell_signal.handle); + if (it != queue_map_->end()) { + SimpleProxyQueue* instance = it->second; + index = instance->submit_index_; + } else { + index = hsa_queue_load_read_index_relaxed_fn(queue); + } + return index; + } + + static uint64_t GetQueueIndex(const hsa_queue_t* queue) { + uint64_t index = 0; + auto it = queue_map_->find(queue->doorbell_signal.handle); + if (it != queue_map_->end()) { + SimpleProxyQueue* instance = it->second; + instance->mutex_lock(); + index = instance->queue_index_; + } else { + index = hsa_queue_load_write_index_relaxed_fn(queue); + } + return index; + } + + static void SetQueueIndex(const hsa_queue_t* queue, uint64_t value) { + auto it = queue_map_->find(queue->doorbell_signal.handle); + if (it != queue_map_->end()) { + SimpleProxyQueue* instance = it->second; + instance->queue_index_ = value; + instance->mutex_unlock(); + } else { + hsa_queue_store_write_index_relaxed_fn(queue, value); + } + } + + hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) { + on_submit_cb_ = on_submit_cb; + on_submit_cb_data_ = data; + return HSA_STATUS_SUCCESS; + } + + void Submit(const packet_t* packet) { + // Compute the write index of queue + const uint64_t que_idx = hsa_queue_load_write_index_relaxed_fn(queue_); + + // Waiting untill there is a free space in the queue + while (que_idx >= (hsa_queue_load_read_index_relaxed_fn(queue_) + size_)); + + // Increment the write index + hsa_queue_store_write_index_relaxed_fn(queue_, que_idx + 1); + + const uint32_t mask = queue_->size - 1; + const uint32_t idx = que_idx & mask; + + // Copy packet to the queue + const packet_word_t* src = reinterpret_cast(packet); + packet_word_t* dst = 
reinterpret_cast(base_address_ + idx); + for (unsigned i = 1; i < sizeof(packet_t) / sizeof(packet_word_t); ++i) { + dst[i] = src[i]; + } + + // To maintain global order to ensure the prior copy of the packet contents is made visible + // before the header is updated. + // With in-order CP it will wait until the first packet in the blob will be valid. + std::atomic* header_atomic_ptr = + reinterpret_cast*>(&dst[0]); + header_atomic_ptr->store(src[0], std::memory_order_release); + + // Doorbell signaling to submit the packet + hsa_signal_store_relaxed_fn(doorbell_signal_, que_idx); + } + + SimpleProxyQueue() + : agent_info_(NULL), + queue_(NULL), + base_address_(NULL), + doorbell_signal_({}), + queue_index_(0), + queue_mask_(0), + submit_index_(0), + on_submit_cb_(NULL), + on_submit_cb_data_(NULL) + { + printf("ROCProfiler: SimpleProxyQueue is enabled\n"); + fflush(stdout); + } + + ~SimpleProxyQueue() {} + + private: + typedef std::map queue_map_t; + + hsa_status_t Init(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue) { + size_ = size; + auto status = Init(agent, size); + *queue = queue_; + return status; + } + + hsa_status_t Init(hsa_agent_t agent, uint32_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + if (agent_info_ != NULL) { + if (agent_info_->dev_type == HSA_DEVICE_TYPE_GPU) { + status = hsa_queue_create_fn(agent, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, + UINT32_MAX, &queue_); + if (status == HSA_STATUS_SUCCESS) { + base_address_ = reinterpret_cast(queue_->base_address); + doorbell_signal_ = queue_->doorbell_signal; + data_array_ = calloc(size + 1, sizeof(packet_t)); + uintptr_t addr = (uintptr_t)data_array_; + queue_->base_address = (void*)((addr + align_mask_) & ~align_mask_); + status = 
hsa_signal_create(1, 0, NULL, &(queue_->doorbell_signal)); + if (status != HSA_STATUS_SUCCESS) abort(); + queue_mask_ = size - 1; + + if (queue_map_ == NULL) queue_map_ = new queue_map_t; + (*queue_map_)[queue_->doorbell_signal.handle] = this; + } + else abort(); + } + } + if (status != HSA_STATUS_SUCCESS) abort(); + return status; + } + + hsa_status_t Cleanup() const { + hsa_status_t status = HSA_STATUS_ERROR; + hsa_signal_t queue_signal = queue_->doorbell_signal; + + // Destroy original HSA queue + queue_->base_address = base_address_; + queue_->doorbell_signal = doorbell_signal_; + status = hsa_queue_destroy_fn(queue_); + if (status != HSA_STATUS_SUCCESS) abort(); + + // Destroy overloaded virtual queue data and signal + free(data_array_); + status = hsa_signal_destroy(queue_signal); + if (status != HSA_STATUS_SUCCESS) abort(); + + return status; + } + + void mutex_lock() { +#if ROCP_PROXY_LOCK + mutex_.lock(); +#endif + } + + void mutex_unlock() { +#if ROCP_PROXY_LOCK + mutex_.unlock(); +#endif + } + + uint32_t size_; + static queue_map_t* queue_map_; + const util::AgentInfo* agent_info_; + hsa_queue_t* queue_; + static const uintptr_t align_mask_ = sizeof(packet_t) - 1; + packet_t* base_address_; + hsa_signal_t doorbell_signal_; + uint64_t queue_index_; + uint64_t queue_mask_; + uint64_t submit_index_; + std::mutex mutex_; + on_submit_cb_t on_submit_cb_; + void* on_submit_cb_data_; + void* data_array_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_SIMPLE_PROXY_QUEUE_H diff --git a/src/core/tracker.h b/src/core/tracker.h new file mode 100644 index 00000000..eae0c112 --- /dev/null +++ b/src/core/tracker.h @@ -0,0 +1,188 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef SRC_CORE_TRACKER_H_ +#define SRC_CORE_TRACKER_H_ + +#include +#include +#include +#include + +#include +#include + +#include "inc/rocprofiler.h" +#include "util/exception.h" +#include "util/logger.h" + +namespace rocprofiler { + +class Tracker { + public: + typedef uint64_t timestamp_t; + typedef long double freq_t; + typedef std::mutex mutex_t; + typedef rocprofiler_dispatch_record_t record_t; + struct entry_t; + typedef std::list sig_list_t; + struct entry_t { + Tracker* tracker; + sig_list_t::iterator it; + hsa_agent_t agent; + hsa_signal_t orig; + hsa_signal_t signal; + record_t* record; + }; + + Tracker(uint64_t timeout = UINT64_MAX) : timeout_(timeout), outstanding(0) { + timestamp_t timestamp_hz = 0; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ×tamp_hz); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)"); + timestamp_factor_ = (freq_t)1000000000 / (freq_t)timestamp_hz; + } + ~Tracker() { + mutex_.lock(); + for (entry_t* entry : sig_list_) { + assert(entry != NULL); + while (1) { + const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire( + entry->signal, + HSA_SIGNAL_CONDITION_LT, + 1, + timeout_, + HSA_WAIT_STATE_BLOCKED); + if (signal_value < 1) break; + else WARN_LOGGING("tracker timeout"); + } + Del(entry); + } + mutex_.unlock(); + } + + // Add tracker entry + entry_t* Add(const hsa_agent_t& agent, const hsa_signal_t& orig) { + hsa_status_t status = HSA_STATUS_ERROR; + entry_t* entry = new entry_t{}; + assert(entry); + entry->tracker = this; + mutex_.lock(); + entry->it = sig_list_.insert(sig_list_.begin(), entry); + mutex_.unlock(); + + entry->agent = agent; + entry->orig = orig; + status = hsa_signal_create(1, 0, NULL, &(entry->signal)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + + record_t* record = new record_t{}; + 
assert(record); + entry->record = record; + status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &record->dispatch); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)"); + + hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + + if (trace_on_) { + mutex_.lock(); + entry->tracker->outstanding++; + fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); + fflush(stdout); + mutex_.unlock(); + } + + return entry; + } + + private: + // Delete tracker entry + void Del(entry_t* entry) { + hsa_signal_destroy(entry->signal); + mutex_.lock(); + sig_list_.erase(entry->it); + mutex_.unlock(); + delete entry; + } + + // Handler for packet completion + static bool Handler(hsa_signal_value_t value, void* arg) { + entry_t* entry = reinterpret_cast(arg); + record_t* record = entry->record; + + if (trace_on_) { + mutex_.lock(); + entry->tracker->outstanding--; + fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); + fflush(stdout); + mutex_.unlock(); + } + + timestamp_t complete_timestamp = 0; + hsa_amd_profiling_dispatch_time_t dispatch_time{}; + + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &complete_timestamp); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)"); + status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); + + record->complete = entry->tracker->timestamp2ns(complete_timestamp); + record->begin = entry->tracker->timestamp2ns(dispatch_time.start); + record->end = entry->tracker->timestamp2ns(dispatch_time.end); + + hsa_signal_t orig = entry->orig; + if 
(orig.handle) { + amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); + amd_signal_t* prof_signal_ptr = reinterpret_cast(entry->signal.handle); + orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; + orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; + + const hsa_signal_value_t value = hsa_signal_load_relaxed(orig); + hsa_signal_store_screlease(orig, value - 1); + } + entry->tracker->Del(entry); + + return false; + } + + inline timestamp_t timestamp2ns(const timestamp_t& timestamp) const { + const freq_t timestamp_ns = (freq_t)timestamp * timestamp_factor_; + return (timestamp_t)timestamp_ns; + } + + // Timestamp frequency factor + freq_t timestamp_factor_; + // Timeout for wait on destruction + timestamp_t timeout_; + // Tracked signals list + sig_list_t sig_list_; + // Inter-thread synchronization + static mutex_t mutex_; + // Outstanding dispatches + uint64_t outstanding; + // Enable tracing + static const bool trace_on_ = false; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_TRACKER_H_ diff --git a/src/core/types.h b/src/core/types.h new file mode 100644 index 00000000..fd8bae33 --- /dev/null +++ b/src/core/types.h @@ -0,0 +1,37 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_CORE_TYPES_H_ +#define SRC_CORE_TYPES_H_ + +#include + +namespace rocprofiler { +typedef hsa_ven_amd_aqlprofile_1_00_pfn_t pfn_t; +typedef hsa_ven_amd_aqlprofile_event_t event_t; +typedef hsa_ven_amd_aqlprofile_parameter_t parameter_t; +typedef hsa_ven_amd_aqlprofile_profile_t profile_t; +typedef hsa_ext_amd_aql_pm4_packet_t packet_t; +typedef uint32_t packet_word_t; +} // namespace rocprofiler + +#endif // SRC_CORE_TYPES_H_ diff --git a/src/util/exception.h b/src/util/exception.h new file mode 100644 index 00000000..8af5f980 --- /dev/null +++ b/src/util/exception.h @@ -0,0 +1,72 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef SRC_UTIL_EXCEPTION_H_
+#define SRC_UTIL_EXCEPTION_H_
+
+// NOTE(review): the header names of the #include lines below were lost when
+// this patch was extracted - restore them from the upstream file.
+#include
+
+#include
+#include
+#include
+
+// Print "<function>(), <stream>" to std::cout and abort the process.
+// `stream` is spliced into an ostream << chain, so it may itself contain
+// `<<` operators. The `error` argument is not used by the expansion.
+// NOTE(review): the expansion is a bare { } block, so `EXC_ABORT(...);`
+// directly before an `else` leaves a stray ';' that detaches the else.
+#define EXC_ABORT(error, stream) \
+  { \
+    std::ostringstream oss; \
+    oss << __FUNCTION__ << "(), " << stream; \
+    std::cout << oss.str() << std::endl; \
+    abort(); \
+  }
+
+// Throw rocprofiler::util::exception carrying `error` as status and
+// "<function>(), <stream>" as message. Same bare-block form as EXC_ABORT.
+#define EXC_RAISING(error, stream) \
+  { \
+    std::ostringstream oss; \
+    oss << __FUNCTION__ << "(), " << stream; \
+    throw rocprofiler::util::exception(error, oss.str()); \
+  }
+
+// Like EXC_RAISING, with the AQL profile library's error string appended to
+// the message. `util::HsaRsrcFactory` is not namespace-qualified from the
+// root, so the macro has to be expanded where that name resolves.
+#define AQL_EXC_RAISING(error, stream) \
+  { \
+    const char* error_string = NULL; \
+    const rocprofiler::pfn_t* api = util::HsaRsrcFactory::Instance().AqlProfileApi(); \
+    api->hsa_ven_amd_aqlprofile_error_string(&error_string); \
+    EXC_RAISING(error, stream << ", " << error_string); \
+  }
+
+namespace rocprofiler {
+namespace util {
+
+// Exception type carrying a numeric status code together with a message.
+class exception : public std::exception {
+ public:
+  // Stores copies of the status and the message.
+  explicit exception(const uint32_t& status, const std::string& msg) : status_(status), str_(msg) {}
+  // Message text; the pointer refers to storage owned by this object.
+  const char* what() const throw() { return str_.c_str(); }
+  // Status code passed to the constructor.
+  uint32_t status() const throw() { return status_; }
+
+ protected:
+  const uint32_t status_;
+  const std::string str_;
+};
+
+} // namespace util
+} // namespace rocprofiler
+
+#endif // SRC_UTIL_EXCEPTION_H_
diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp
new file mode 100644
index 00000000..ff749d15
--- /dev/null
+++ b/src/util/hsa_rsrc_factory.cpp
@@ -0,0 +1,562 @@
+/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#include "util/hsa_rsrc_factory.h"
+
+// NOTE(review): the header names of the #include lines below and the target
+// types of several reinterpret_cast/static_cast expressions in this file were
+// lost when this patch was extracted - restore them from the upstream file.
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace rocprofiler {
+namespace util {
+
+// Callback for hsa_iterate_agents(): registers each agent found in the system.
+// `data` is the HsaRsrcFactory instance. Returns HSA_STATUS_SUCCESS when
+// AddAgentInfo() produced an entry, HSA_STATUS_ERROR when it returned NULL.
+// NOTE(review): AddAgentInfo() returns NULL for an agent that is neither CPU
+// nor GPU, so such an agent makes this callback report an error - confirm
+// that this is intended.
+hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) {
+  hsa_status_t status = HSA_STATUS_ERROR;
+  HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data);
+  const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent);
+  if (agent_info != NULL) status = HSA_STATUS_SUCCESS;
+  return status;
+}
+
+// This function checks to see if the provided
+// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
+// the function adds an additional requirement that the pool have the
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false,
+// pools must NOT have this property.
+// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is
+// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but
+// no pool was found meeting the requirements. If an error is encountered, we
+// return that error.
+// The matching pool is written through `data`; a NULL `data` yields
+// HSA_STATUS_ERROR_INVALID_ARGUMENT.
+static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
+  hsa_status_t err;
+  hsa_amd_segment_t segment;
+  uint32_t flag;
+
+  if (nullptr == data) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+  CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
+  // Not a global-segment pool: keep iterating
+  if (HSA_AMD_SEGMENT_GLOBAL != segment) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
+  CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
+
+  uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT;
+
+  // Kernarg property does not match the request: keep iterating
+  if ((karg_st == 0 && kern_arg) || (karg_st != 0 && !kern_arg)) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  *(reinterpret_cast(data)) = pool;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
+// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
+hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
+  return FindGlobalPool(pool, data, false);
+}
+
+// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
+// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
+hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
+  return FindGlobalPool(pool, data, true);
+}
+
+// Constructor of the class
+// Optionally initializes the HSA runtime, enumerates the system agents and
+// loads the AqlProfile and Loader API tables. Each step's status is passed
+// to CHECK_STATUS (macro defined outside this file).
+HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
+  hsa_status_t status;
+  // Initialize the Hsa Runtime
+  if (initialize_hsa_) {
+    status = hsa_init();
+    CHECK_STATUS("Error in hsa_init", status);
+  }
+  // Discover the set of Gpu devices available on the platform
+  status = hsa_iterate_agents(GetHsaAgentsCallback, this);
+  CHECK_STATUS("Error Calling hsa_iterate_agents", status);
+
+  // Get AqlProfile API table
+  // Either resolved from the library directly (ROCP_LD_AQLPROFILE) or
+  // queried from the runtime as an extension table
+  aqlprofile_api_ = {0};
+#ifdef ROCP_LD_AQLPROFILE
+  status = LoadAqlProfileLib(&aqlprofile_api_);
+#else
+  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_);
+#endif
+  CHECK_STATUS("aqlprofile API table load failed", status);
+
+  // Get Loader API table
+  loader_api_ = {0};
+  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_);
+  CHECK_STATUS("loader API table query failed", status);
+}
+
+// Destructor of the class
+// Frees the AgentInfo objects and shuts the HSA runtime down if this object
+// initialized it.
+HsaRsrcFactory::~HsaRsrcFactory() {
+  for (auto p : cpu_list_) delete p;
+  for (auto p : gpu_list_) delete p;
+  if (initialize_hsa_) {
+    hsa_status_t status = hsa_shut_down();
+    CHECK_STATUS("Error in hsa_shut_down", status);
+  }
+}
+
+// Fill the AqlProfile API table by resolving each entry point with dlsym()
+// from the library named by kAqlProfileLib.
+// Returns HSA_STATUS_ERROR if the library cannot be opened, HSA_STATUS_SUCCESS
+// otherwise.
+// NOTE(review): the dlsym() results are not checked, so a missing symbol
+// leaves a NULL entry in the table; the library handle is never closed.
+hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
+  void* handle = dlopen(kAqlProfileLib, RTLD_NOW);
+  if (handle == NULL) {
+    fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror());
+    return HSA_STATUS_ERROR;
+  }
+  dlerror(); /* Clear any existing error */
+
+  api->hsa_ven_amd_aqlprofile_error_string =
+      (decltype(::hsa_ven_amd_aqlprofile_error_string)*)dlsym(
+          handle, "hsa_ven_amd_aqlprofile_error_string");
+  api->hsa_ven_amd_aqlprofile_validate_event =
+      (decltype(::hsa_ven_amd_aqlprofile_validate_event)*)dlsym(
+          handle, "hsa_ven_amd_aqlprofile_validate_event");
+  api->hsa_ven_amd_aqlprofile_start =
+      (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start");
+  api->hsa_ven_amd_aqlprofile_stop =
+      (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop");
+  api->hsa_ven_amd_aqlprofile_read =
+      (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read");
+  api->hsa_ven_amd_aqlprofile_legacy_get_pm4 =
+      (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym(
+          handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
+  api->hsa_ven_amd_aqlprofile_get_info = (decltype(::hsa_ven_amd_aqlprofile_get_info)*)dlsym(
+      handle, "hsa_ven_amd_aqlprofile_get_info");
+  api->hsa_ven_amd_aqlprofile_iterate_data =
+      (decltype(::hsa_ven_amd_aqlprofile_iterate_data)*)dlsym(
+          handle, "hsa_ven_amd_aqlprofile_iterate_data");
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Add system agent info
+// Creates an AgentInfo for a CPU or GPU agent, appends it to the matching
+// list and registers it in agent_map_ under the agent handle.
+// Returns the new entry, or NULL for any other device type.
+const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
+  // Determine if device is a Gpu agent
+  hsa_status_t status;
+  AgentInfo* agent_info = NULL;
+
+  hsa_device_type_t type;
+  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
+  CHECK_STATUS("Error Calling hsa_agent_get_info", status);
+
+  if (type == HSA_DEVICE_TYPE_CPU) {
+    agent_info = new AgentInfo{};
+    agent_info->dev_id = agent;
+    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
+    // Index within the CPU list
+    agent_info->dev_index = cpu_list_.size();
+
+    status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
+    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status);
+    status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool);
+    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status);
+    agent_info->gpu_pool = {};
+
+    cpu_list_.push_back(agent_info);
+    cpu_agents_.push_back(agent);
+  }
+
+  if (type == HSA_DEVICE_TYPE_GPU) {
+    agent_info = new AgentInfo{};
+    agent_info->dev_id = agent;
+    agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
+    // The return values of the hsa_agent_get_info() queries below are ignored
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
+    // gfxip is the first four characters of the agent name
+    strncpy(agent_info->gfxip, agent_info->name, 4);
+    agent_info->gfxip[4] = '\0';
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
+    agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false;
+    hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
+                       &agent_info->cu_num);
+    hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
+                       &agent_info->waves_per_cu);
+    hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
+                       &agent_info->simds_per_cu);
+    hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
+                       &agent_info->se_num);
+    hsa_agent_get_info(agent,
+                       static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
+                       &agent_info->shader_arrays_per_se);
+
+    agent_info->cpu_pool = {};
+    agent_info->kern_arg_pool = {};
+    status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
+    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status);
+
+    // Set GPU index
+    agent_info->dev_index = gpu_list_.size();
+    gpu_list_.push_back(agent_info);
+    gpu_agents_.push_back(agent);
+  }
+
+  if (agent_info) agent_map_[agent.handle] = agent_info;
+
+  return agent_info;
+}
+
+// Return system agent info
+// Looks the agent up by handle; returns NULL if it was never registered.
+const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) {
+  const AgentInfo* agent_info = NULL;
+  auto it = agent_map_.find(agent.handle);
+  if (it != agent_map_.end()) {
+    agent_info = it->second;
+  }
+  return agent_info;
+}
+
+// Get the count of Hsa Gpu Agents available on the platform
+//
+// @return uint32_t Number of Gpu agents on platform
+//
+uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); }
+
+// Get the count of Hsa Cpu Agents available on the platform
+//
+// @return uint32_t Number of Cpu agents on platform
+//
+uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); }
+
+// Get the AgentInfo handle of a Gpu device
+//
+// @param idx Gpu Agent at specified index
+//
+// @param agent_info Output parameter updated with AgentInfo
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
+  // Determine if request is valid
+  uint32_t size = uint32_t(gpu_list_.size());
+  if (idx >= size) {
+    return false;
+  }
+
+  // Copy AgentInfo from specified index
+  *agent_info = gpu_list_[idx];
+
+  return true;
+}
+
+// Get the AgentInfo handle of a Cpu device
+//
+// @param idx Cpu Agent at specified index
+//
+// @param agent_info Output parameter updated with AgentInfo
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
+  // Determine if request is valid
+  uint32_t size = uint32_t(cpu_list_.size());
+  if (idx >= size) {
+    return false;
+  }
+
+  // Copy AgentInfo from specified index
+  *agent_info = cpu_list_[idx];
+  return true;
+}
+
+// Create a Queue object and return its handle. The queue object is expected
+// to support user requested number of Aql dispatch packets.
+//
+// @param agent_info Gpu Agent on which to create a queue object
+//
+// @param num_Pkts Number of packets to be held by queue
+//
+// @param queue Output parameter updated with handle of queue object
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
+                                 hsa_queue_t** queue) {
+  hsa_status_t status;
+  status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
+                            UINT32_MAX, UINT32_MAX, queue);
+  return (status == HSA_STATUS_SUCCESS);
+}
+
+// Create a Signal object and return its handle.
+// @param value Initial value of signal object
+// @param signal Output parameter updated with handle of signal object
+// @return bool true if successful, false otherwise
+bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) {
+  hsa_status_t status;
+  status = hsa_signal_create(value, 0, NULL, signal);
+  return (status == HSA_STATUS_SUCCESS);
+}
+
+// Allocate memory for use by a kernel of specified size in specified
+// agent's memory region.
+// @param agent_info Agent from whose memory region to allocate
+// @param size Size of memory in terms of bytes
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) {
+  uint8_t* buffer = NULL;
+  // Round the request up to a whole number of pages
+  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
+  const hsa_status_t status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0,
+                                                           reinterpret_cast<void**>(&buffer));
+  return (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
+}
+
+// Allocate memory to pass kernel parameters.
+// Memory is allocated from the kern-arg pool of the first CPU agent and is then made
+// accessible to the GPU given by the AgentInfo parameter.
+// @param agent_info Agent which is granted access to the allocated memory
+// @param size Size of memory in terms of bytes
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) {
+  // Without a CPU agent there is no kern-arg pool to allocate from
+  if (cpu_agents_.empty()) return NULL;
+
+  uint8_t* buffer = NULL;
+  // Round the request up to a whole number of pages
+  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
+  hsa_status_t status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0,
+                                                     reinterpret_cast<void**>(&buffer));
+  if (status != HSA_STATUS_SUCCESS) return NULL;
+
+  // Both the CPU and GPU can access the kernel arguments
+  hsa_agent_t ag_list[1] = {agent_info->dev_id};
+  status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer);
+  if (status != HSA_STATUS_SUCCESS) {
+    // Do not leak the allocation when access could not be granted
+    hsa_amd_memory_pool_free(buffer);
+    return NULL;
+  }
+  return buffer;
+}
+
+// Allocate system memory accessible by both CPU and GPU
+// @param agent_info Agent from whose memory region to allocate
+// @param size Size of memory in terms of bytes
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) {
+  // Without a CPU agent there is no system pool to allocate from
+  if (cpu_agents_.empty()) return NULL;
+
+  uint8_t* buffer = NULL;
+  // Round the request up to a whole number of pages
+  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
+  hsa_status_t status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0,
+                                                     reinterpret_cast<void**>(&buffer));
+  if (status != HSA_STATUS_SUCCESS) return NULL;
+
+  // Both the CPU and GPU can access the memory
+  hsa_agent_t ag_list[1] = {agent_info->dev_id};
+  status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer);
+  if (status != HSA_STATUS_SUCCESS) {
+    // Do not leak the allocation when access could not be granted
+    hsa_amd_memory_pool_free(buffer);
+    return NULL;
+  }
+  return buffer;
+}
+
+// Allocate memory for command buffer.
+// On an APU with CMD_MEMORY_MMAP enabled the buffer is an anonymous shared mapping,
+// otherwise it is system memory obtained from AllocateSysMemory().
+// @param agent_info Agent from whose memory region to allocate
+// @param size Size of memory in terms of bytes
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) {
+  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
+  if (agent_info->is_apu && CMD_MEMORY_MMAP) {
+    // Anonymous mappings take fd -1; mmap reports failure with MAP_FAILED, not NULL
+    void* mapping = mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC,
+                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+    return (mapping != MAP_FAILED) ? reinterpret_cast<uint8_t*>(mapping) : NULL;
+  }
+  return AllocateSysMemory(agent_info, size);
+}
+
+// Copy data from GPU to host memory
+// The copy is issued asynchronously and this call blocks until its completion signal fires.
+// @param agent Agent passed to the copy as owner of the source buffer
+// @param dst Destination buffer
+// @param src Source buffer
+// @param size Number of bytes to copy
+// @return bool true if the copy completed successfully, false otherwise
+bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) {
+  // The first CPU agent is used as the destination agent of the copy
+  if (cpu_agents_.empty()) return false;
+
+  hsa_signal_t s = {};
+  hsa_status_t status = hsa_signal_create(1, 0, NULL, &s);
+  if (status != HSA_STATUS_SUCCESS) return false;
+
+  status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s);
+  if (status == HSA_STATUS_SUCCESS) {
+    // The signal starts at 1; any other value after the wait is treated as a failure
+    if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
+                                  HSA_WAIT_STATE_BLOCKED) != 0) {
+      status = HSA_STATUS_ERROR;
+    }
+  }
+
+  // Always release the signal, but never let a successful destroy mask a failed copy
+  const hsa_status_t destroy_status = hsa_signal_destroy(s);
+  if (status == HSA_STATUS_SUCCESS) status = destroy_status;
+
+  return (status == HSA_STATUS_SUCCESS);
+}
+bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) {
+  return Memcpy(agent_info->dev_id, dst, src, size);
+}
+
+// Memory free method
+// Exits the process through CHECK_STATUS if the runtime reports an error.
+bool HsaRsrcFactory::FreeMemory(void* ptr) {
+  const hsa_status_t status = hsa_memory_free(ptr);
+  CHECK_STATUS("hsa_memory_free", status);
+  return (status == HSA_STATUS_SUCCESS);
+}
+
+// Loads a code object file into a new executable and looks up a kernel symbol in it
+// @param agent_info Gpu device for which to load the code object
+// @param brig_path File path of the code object file
+// @param kernel_name Name of the kernel symbol to look up
+// @param executable Output parameter updated with the created and frozen executable
+// @param code_desc Output parameter updated with the kernel symbol that could
+// be used to submit for execution
+// @return bool true if successful, false if the file or the code object reader
+// could not be opened; other runtime errors exit through CHECK_STATUS
+bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
+                                     const char* kernel_name, hsa_executable_t* executable,
+                                     hsa_executable_symbol_t* code_desc) {
+  hsa_status_t status = HSA_STATUS_ERROR;
+
+  // Build the code object filename
+  std::string filename(brig_path);
+  std::clog << "Code object filename: " << filename << std::endl;
+
+  // Open the file containing code object
+  hsa_file_t file_handle = open(filename.c_str(), O_RDONLY);
+  if (file_handle == -1) {
+    std::cerr << "Error: failed to load '" << filename << "'" << std::endl;
+    assert(false);
+    return false;
+  }
+
+  // Create code object reader
+  // NOTE(review): file_handle is never closed and the reader is never destroyed here,
+  // including on the failure path below - confirm whether that is intended
+  hsa_code_object_reader_t code_obj_rdr = {0};
+  status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
+  if (status != HSA_STATUS_SUCCESS) {
+    std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl;
+    return false;
+  }
+
+  // Create executable.
+  status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
+                                     NULL, executable);
+  CHECK_STATUS("Error in creating executable object", status);
+
+  // Load code object.
+  status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr,
+                                                 NULL, NULL);
+  CHECK_STATUS("Error in loading executable object", status);
+
+  // Freeze executable.
+  status = hsa_executable_freeze(*executable, "");
+  CHECK_STATUS("Error in freezing executable object", status);
+
+  // Get symbol handle.
+  hsa_executable_symbol_t kernelSymbol;
+  status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
+                                     &kernelSymbol);
+  CHECK_STATUS("Error in looking up kernel symbol", status);
+
+  // Update output parameter
+  *code_desc = kernelSymbol;
+  return true;
+}
+
+// Print the various fields of Hsa Gpu Agents
+// @param header Title line printed before the list of agents
+// @return bool always true
+bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
+  std::clog << header << " :" << std::endl;
+
+  const int size = uint32_t(gpu_list_.size());
+  for (int idx = 0; idx < size; idx++) {
+    const AgentInfo* agent_info = gpu_list_[idx];
+
+    std::clog << "> agent[" << idx << "] :" << std::endl;
+    std::clog << ">> Name : " << agent_info->name << std::endl;
+    std::clog << ">> APU : " << agent_info->is_apu << std::endl;
+    std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl;
+    std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
+    std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
+    std::clog << ">> CU number : " << agent_info->cu_num << std::endl;
+    std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl;
+    std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl;
+    std::clog << ">> SE number : " << agent_info->se_num << std::endl;
+    std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl;
+  }
+  return true;
+}
+
+// Submit one 64-byte AQL packet to the given queue
+// @param queue Queue to write the packet to
+// @param packet Pointer to the 64-byte packet to copy into the queue
+// @return uint64_t Write index of the slot the packet was written to
+uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
+  const uint32_t slot_size_b = 0x40;
+
+  // Advance the command queue.
+  // The slot is reserved with an atomic add: a separate load and store of the write
+  // index lets two producers of the same queue claim the same slot.
+  const uint64_t write_idx = hsa_queue_add_write_index_relaxed(queue, 1);
+  // Wait until the reserved slot has been consumed by the packet processor
+  while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
+    sched_yield();
+  }
+
+  uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
+  uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) +
+                                                     (slot_idx * slot_size_b));
+  const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
+
+  // Copy buffered commands into the queue slot.
+  // Overwrite the AQL invalid header (first dword) last.
+  // This prevents the slot from being read until it's fully written.
+  memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t));
+  std::atomic<uint32_t>* header_atomic_ptr =
+      reinterpret_cast<std::atomic<uint32_t>*>(&queue_slot[0]);
+  header_atomic_ptr->store(slot_data[0], std::memory_order_release);
+
+  // Ring the doorbell
+  hsa_signal_store_relaxed(queue->doorbell_signal, write_idx);
+
+  return write_idx;
+}
+
+// Submit a buffer of 64-byte AQL packets to the given queue, one packet at a time
+// Aborts the process if size_bytes is not a multiple of the packet size.
+// @param queue Queue to write the packets to
+// @param packet Pointer to the first packet of the buffer
+// @param size_bytes Size of the buffer in bytes
+// @return uint64_t Write index of the last submitted packet, 0 if the buffer is empty
+uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) {
+  const uint32_t slot_size_b = 0x40;
+  if ((size_bytes & (slot_size_b - 1)) != 0) {
+    fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes);
+    abort();
+  }
+
+  const char* begin = reinterpret_cast<const char*>(packet);
+  const char* end = begin + size_bytes;
+  uint64_t write_idx = 0;
+  for (const char* ptr = begin; ptr < end; ptr += slot_size_b) {
+    write_idx = Submit(queue, ptr);
+  }
+
+  return write_idx;
+}
+
+HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL;
+HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;
+
+}  // namespace util
+}  // namespace rocprofiler
diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h
new file mode 100644
index 00000000..b00ee8ed
--- /dev/null
+++ b/src/util/hsa_rsrc_factory.h
@@ -0,0 +1,288 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_UTIL_HSA_RSRC_FACTORY_H_ +#define SRC_UTIL_HSA_RSRC_FACTORY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define HSA_ARGUMENT_ALIGN_BYTES 16 +#define HSA_QUEUE_ALIGN_BYTES 64 +#define HSA_PACKET_ALIGN_BYTES 64 + +#define CHECK_STATUS(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? emsg : ""); \ + exit(1); \ + } + +#define CHECK_ITER_STATUS(msg, status) \ + if (status != HSA_STATUS_INFO_BREAK) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? 
emsg : ""); \ + exit(1); \ + } + +namespace rocprofiler { +namespace util { +static const size_t MEM_PAGE_BYTES = 0x1000; +static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; +typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; + +// Encapsulates information about a Hsa Agent such as its +// handle, name, max queue size, max wavefront size, etc. +struct AgentInfo { + // Handle of Agent + hsa_agent_t dev_id; + + // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 + uint32_t dev_type; + + // APU flag + bool is_apu; + + // Agent system index + uint32_t dev_index; + + // GFXIP name + char gfxip[64]; + + // Name of Agent whose length is less than 64 + char name[64]; + + // Max size of Wavefront size + uint32_t max_wave_size; + + // Max size of Queue buffer + uint32_t max_queue_size; + + // Hsail profile supported by agent + hsa_profile_t profile; + + // CPU/GPU/kern-arg memory pools + hsa_amd_memory_pool_t cpu_pool; + hsa_amd_memory_pool_t gpu_pool; + hsa_amd_memory_pool_t kern_arg_pool; + + // The number of compute unit available in the agent. + uint32_t cu_num; + + // Maximum number of waves possible in a Compute Unit. + uint32_t waves_per_cu; + + // Number of SIMD's per compute unit CU + uint32_t simds_per_cu; + + // Number of Shader Engines (SE) in Gpu + uint32_t se_num; + + // Number of Shader Arrays Per Shader Engines in Gpu + uint32_t shader_arrays_per_se; +}; + +class HsaRsrcFactory { + public: + typedef std::recursive_mutex mutex_t; + + static HsaRsrcFactory* Create(bool initialize_hsa = true) { + std::lock_guard lck(mutex_); + if (instance_ == NULL) { + instance_ = new HsaRsrcFactory(initialize_hsa); + } + return instance_; + } + + static HsaRsrcFactory& Instance() { + if (instance_ == NULL) instance_ = Create(false); + hsa_status_t status = (instance_ != NULL) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); + return *instance_; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_) delete instance_; + instance_ = NULL; + } + + // Return system agent info + const AgentInfo* GetAgentInfo(const hsa_agent_t agent); + + // Get the count of Hsa Gpu Agents available on the platform + // @return uint32_t Number of Gpu agents on platform + uint32_t GetCountOfGpuAgents(); + + // Get the count of Hsa Cpu Agents available on the platform + // @return uint32_t Number of Cpu agents on platform + uint32_t GetCountOfCpuAgents(); + + // Get the AgentInfo handle of a Gpu device + // @param idx Gpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Get the AgentInfo handle of a Cpu device + // @param idx Cpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Create a Queue object and return its handle. The queue object is expected + // to support user requested number of Aql dispatch packets. + // @param agent_info Gpu Agent on which to create a queue object + // @param num_Pkts Number of packets to be held by queue + // @param queue Output parameter updated with handle of queue object + // @return bool true if successful, false otherwise + bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); + + // Create a Signal object and return its handle. 
+ // @param value Initial value of signal object + // @param signal Output parameter updated with handle of signal object + // @return bool true if successful, false otherwise + bool CreateSignal(uint32_t value, hsa_signal_t* signal); + + // Allocate local GPU memory + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory tp pass kernel parameters + // Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size); + + // Allocate system memory accessible from both CPU and GPU + // Memory is alocated accessible to all CPU agents and AgentInfo parameter is ignored. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory for command buffer. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. 
+ uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + + // Copy data from GPU to host memory + bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); + bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); + + // Memory free method + static bool FreeMemory(void* ptr); + + // Loads an Assembled Brig file and Finalizes it into Device Isa + // @param agent_info Gpu device for which to finalize + // @param brig_path File path of the Assembled Brig file + // @param kernel_name Name of the kernel to finalize + // @param code_desc Handle of finalized Code Descriptor that could + // be used to submit for execution + // @return true if successful, false otherwise + bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, + hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); + + // Print the various fields of Hsa Gpu Agents + bool PrintGpuAgents(const std::string& header); + + // Submit AQL packet to given queue + static uint64_t Submit(hsa_queue_t* queue, const void* packet); + static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + + // Return AqlProfile API table + typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t; + const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } + + // Return Loader API table + const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + + private: + // System agents iterating callback + static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); + + // Callback function to find and bind kernarg region of an agent + static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data); + + // Load AQL profile HSA extension library directly + static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api); + + // Constructor of the class. 
Will initialize the Hsa Runtime and + // query the system topology to get the list of Cpu and Gpu devices + explicit HsaRsrcFactory(bool initialize_hsa); + + // Destructor of the class + ~HsaRsrcFactory(); + + // Add an instance of AgentInfo representing a Hsa Gpu agent + const AgentInfo* AddAgentInfo(const hsa_agent_t agent); + + // To mmap command buffer memory + static const bool CMD_MEMORY_MMAP = false; + + // HSA was initialized + const bool initialize_hsa_; + + static HsaRsrcFactory* instance_; + static mutex_t mutex_; + + // Used to maintain a list of Hsa Gpu Agent Info + std::vector gpu_list_; + std::vector gpu_agents_; + + // Used to maintain a list of Hsa Cpu Agent Info + std::vector cpu_list_; + std::vector cpu_agents_; + + // System agents map + std::map agent_map_; + + // AqlProfile API table + aqlprofile_pfn_t aqlprofile_api_; + + // Loader API table + hsa_ven_amd_loader_1_00_pfn_t loader_api_; +}; + +} // namespace util +} // namespace rocprofiler + +#endif // SRC_UTIL_HSA_RSRC_FACTORY_H_ diff --git a/src/util/logger.h b/src/util/logger.h new file mode 100644 index 00000000..97477899 --- /dev/null +++ b/src/util/logger.h @@ -0,0 +1,191 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_UTIL_LOGGER_H_ +#define SRC_UTIL_LOGGER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace rocprofiler { +namespace util { + +class Logger { + public: + typedef std::recursive_mutex mutex_t; + + template Logger& operator<<(const T& m) { + std::ostringstream oss; + oss << m; + if (!streaming_) + Log(oss.str()); + else + Put(oss.str()); + streaming_ = true; + return *this; + } + + typedef void (*manip_t)(); + Logger& operator<<(manip_t f) { + f(); + return *this; + } + + static void begm() { Instance().ResetStreaming(true); } + static void endl() { Instance().ResetStreaming(false); } + + static const std::string& LastMessage() { + Logger& logger = Instance(); + std::lock_guard lck(mutex_); + return logger.message_[GetTid()]; + } + + static Logger* Create() { + std::lock_guard lck(mutex_); + if (instance_ == NULL) instance_ = new Logger(); + return instance_; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_ != NULL) delete instance_; + instance_ = NULL; + } + + static Logger& Instance() { + Create(); + return *instance_; + } + + private: + static uint32_t GetPid() { return syscall(__NR_getpid); } + static uint32_t GetTid() { return syscall(__NR_gettid); } + + Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) { + const char* path = 
getenv("ROCPROFILER_LOG"); + if (path != NULL) { + file_ = fopen("/tmp/rocprofiler_log.txt", "a"); + } + ResetStreaming(false); + } + + ~Logger() { + if (file_ != NULL) { + if (dirty_) Put("\n"); + fclose(file_); + } + } + + void ResetStreaming(const bool messaging) { + std::lock_guard lck(mutex_); + if (messaging) { + message_[GetTid()] = ""; + } else if (streaming_) { + Put("\n"); + dirty_ = false; + } + messaging_ = messaging; + streaming_ = messaging; + } + + void Put(const std::string& m) { + std::lock_guard lck(mutex_); + if (messaging_) { + message_[GetTid()] += m; + } + if (file_ != NULL) { + dirty_ = true; + flock(fileno(file_), LOCK_EX); + fprintf(file_, "%s", m.c_str()); + fflush(file_); + flock(fileno(file_), LOCK_UN); + } + } + + void Log(const std::string& m) { + const time_t rawtime = time(NULL); + tm tm_info; + localtime_r(&rawtime, &tm_info); + char tm_str[26]; + strftime(tm_str, 26, "%Y-%m-%d %H:%M:%S", &tm_info); + std::ostringstream oss; + oss << "<" << tm_str << std::dec << " pid" << GetPid() << " tid" << GetTid() << "> " << m; + Put(oss.str()); + } + + FILE* file_; + bool dirty_; + bool streaming_; + bool messaging_; + + static mutex_t mutex_; + static Logger* instance_; + std::map message_; +}; + +} // namespace util +} // namespace rocprofiler + +#define ERR_LOGGING(stream) \ + { \ + rocprofiler::util::Logger::Instance() << "error: " << rocprofiler::util::Logger::begm \ + << stream << rocprofiler::util::Logger::endl; \ + } + +#define INFO_LOGGING(stream) \ + { \ + rocprofiler::util::Logger::Instance() << "info: " << rocprofiler::util::Logger::begm << stream \ + << rocprofiler::util::Logger::endl; \ + } + +#define WARN_LOGGING(stream) \ + { \ + std::cerr << "ROCProfiler: " << stream << std::endl; \ + rocprofiler::util::Logger::Instance() << "warning: " << rocprofiler::util::Logger::begm << stream \ + << rocprofiler::util::Logger::endl; \ + } + +#ifdef DEBUG +#define DBG_LOGGING(stream) \ + { \ + rocprofiler::util::Logger::Instance() << 
rocprofiler::util::Logger::begm << "debug: \"" \ + << stream << "\"" < < < < \ + " in " << __FUNCTION__ << " at " << __FILE__ << " line " << __LINE__ \ + << rocprofiler::util::Logger::endl; \ + } +#endif + +#endif // SRC_UTIL_LOGGER_H_ diff --git a/src/xml/expr.h b/src/xml/expr.h new file mode 100644 index 00000000..731e25e4 --- /dev/null +++ b/src/xml/expr.h @@ -0,0 +1,446 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef _SRC_XML_EXPR_H +#define _SRC_XML_EXPR_H + +#include +#include +#include +#include +#include +#include + +namespace xml { +class exception_t : public std::exception { + public: + explicit exception_t(const std::string& msg) : str_(msg) {} + const char* what() const throw() { return str_.c_str(); } + + protected: + const std::string str_; +}; + +class div_zero_exception_t : public exception_t { + public: + explicit div_zero_exception_t(const std::string& msg) : exception_t("Divide by zero exception " + msg) {} +}; + +typedef uint64_t args_t; +static const args_t ARGS_MAX = UINT64_MAX; +typedef std::map args_map_t; +class Expr; + +template class any_cache_t { + public: + virtual ~any_cache_t() {} + virtual bool Lookup(const std::string& name, T& result) const = 0; +}; + +typedef any_cache_t expr_cache_t; +typedef any_cache_t args_cache_t; + +class bin_expr_t { + public: + static const bin_expr_t* CreateExpr(const bin_expr_t* arg1, const bin_expr_t* arg2, + const char op); + static const bin_expr_t* CreateArg(Expr* obj, const std::string str); + + bin_expr_t() : arg1_(NULL), arg2_(NULL) {} + bin_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : arg1_(arg1), arg2_(arg2) {} + virtual ~bin_expr_t() { + if (arg1_) delete arg1_; + if (arg2_) delete arg2_; + } + + virtual args_t Eval(const args_cache_t& args) const = 0; + virtual std::string Symbol() const = 0; + + std::string String() const { + std::string str; + if (arg1_) { + str = "(" + arg1_->String() + " " + Symbol() + " " + arg2_->String() + ")"; + } else + str = Symbol(); + return str; + } + + protected: + const bin_expr_t* arg1_; + const bin_expr_t* arg2_; +}; + +class Expr { + public: + explicit Expr(const std::string& expr, const expr_cache_t* cache) + : expr_(expr), pos_(0), sub_count_(0), cache_(cache), is_sub_expr_(false) + { + sub_vec_ = new std::vector; + var_vec_ = new std::vector; + tree_ = ParseExpr(); + } 
+ + explicit Expr(const std::string& expr, const Expr* obj) + : expr_(expr), + pos_(0), + sub_count_(0), + cache_(obj->cache_), + sub_vec_(obj->sub_vec_), + var_vec_(obj->var_vec_), + is_sub_expr_(true) + { + sub_vec_->push_back(this); + tree_ = ParseExpr(); + if (!SubCheck()) throw exception_t("expr '" + expr_ + "', bad parenthesis count"); + } + + ~Expr() { + if (!is_sub_expr_) { + delete cache_; + for (auto it : *sub_vec_) delete it; + delete sub_vec_; + delete var_vec_; + delete tree_; + } + } + + std::string GetStr() const { return expr_; } + const expr_cache_t* GetCache() const { return cache_; } + const bin_expr_t* GetTree() const { return tree_; } + + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + try { + result = tree_->Eval(args); + } catch (const div_zero_exception_t& e) { + if (div_zero_exc_on) std::cout << "Expr::Eval() exc(" << e.what() << ") : " << String() << std::endl; + } catch (const exception_t& e) { + throw e; + } + return result; + } + + std::string Lookup(const std::string& str) const { + std::string result; + if (cache_ && !(cache_->Lookup(str, result))) + throw exception_t("expr '" + expr_ + "', lookup '" + str + "' failed"); + return result; + } + + void AddVar(const std::string& str) { + bool found = false; + for (std::string s : *var_vec_) + if (s == str) found = true; + if (!found) var_vec_->push_back(str); + } + + const std::vector& GetVars() const { return *var_vec_; } + + std::string String() const { return tree_->String(); } + + private: + const bin_expr_t* ParseExpr() { + const bin_expr_t* expr = ParseArg(); + while (!IsEnd()) { + const char op = Symb(); + const bin_expr_t* second_arg = NULL; + if (IsSymb(')')) { + Next(); + SubClose(); + break; + } + if (IsSymb('*') || IsSymb('/')) { + Next(); + second_arg = ParseArg(); + expr = bin_expr_t::CreateExpr(expr, second_arg, op); + } else if (IsSymb('+') || IsSymb('-')) { + Next(); + second_arg = ParseExpr(); + expr = bin_expr_t::CreateExpr(expr, second_arg, op); + 
break; + } else { + throw exception_t("expr '" + expr_ + "', bad operator '" + op + "'"); + } + } + return expr; + } + + const bin_expr_t* ParseArg() { + const bin_expr_t* arg = NULL; + if (IsSymb('(')) { + Next(); + SubOpen(); + arg = ParseExpr(); + } else { + const unsigned pos = FindOp(); + const std::string str = CutTill(pos); + arg = bin_expr_t::CreateArg(this, str); + if (arg == NULL) throw exception_t("expr '" + expr_ + "', bad argument '" + str + "'"); + } + return arg; + } + + char Symb() const { return Symb(pos_); } + char Symb(const unsigned ind) const { return expr_[ind]; } + bool IsEnd() const { return (pos_ >= expr_.length()); } + bool IsSymb(const char c) const { return IsSymb(pos_, c); } + bool IsSymb(const unsigned ind, const char c) const { return (expr_[ind] == c); } + void Next() { ++pos_; } + void SubOpen() { ++sub_count_; } + void SubClose() { --sub_count_; } + bool SubCheck() const { return (sub_count_ == 0); } + unsigned FindOp() const { + unsigned i = pos_; + unsigned open_n = 0; + while (i < expr_.length()) { + switch (Symb(i)) { + case '*': + case '/': + case '+': + case '-': + goto end; + case '(': + ++open_n; + break; + case ')': + if (open_n != 0) i += 1; + goto end; + } + ++i; + } + end: + return i; + } + std::string CutTill(const unsigned pos) { + const std::string str = (pos > pos_) ? 
expr_.substr(pos_, pos - pos_) : ""; + pos_ = pos; + return str; + } + + static const bool div_zero_exc_on = false; + + const std::string expr_; + unsigned pos_; + unsigned sub_count_; + const bin_expr_t* tree_; + const expr_cache_t* const cache_; + std::vector* sub_vec_; + std::vector* var_vec_; + const bool is_sub_expr_; +}; + +class add_expr_t : public bin_expr_t { + public: + add_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) + arg2_->Eval(args)); } + std::string Symbol() const { return "+"; } +}; +class sub_expr_t : public bin_expr_t { + public: + sub_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) - arg2_->Eval(args)); } + std::string Symbol() const { return "-"; } +}; +class mul_expr_t : public bin_expr_t { + public: + mul_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) * arg2_->Eval(args)); } + std::string Symbol() const { return "*"; } +}; +class div_expr_t : public bin_expr_t { + public: + div_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { + const args_t denominator = arg2_->Eval(args); + if (denominator == 0) throw div_zero_exception_t("div_expr_t::Eval()"); + return (arg1_->Eval(args) / denominator); + } + std::string Symbol() const { return "/"; } +}; +class const_expr_t : public bin_expr_t { + public: + const_expr_t(const args_t value) : value_(value) {} + args_t Eval(const args_cache_t&) const { return value_; } + std::string Symbol() const { + std::ostringstream os; + os << value_; + return os.str(); + } + + private: + const args_t value_; +}; +class var_expr_t : public bin_expr_t { + public: + var_expr_t(const std::string name) : name_(name) {} 
+ args_t Eval(const args_cache_t& args) const { + args_t result = 0; + if (!args.Lookup(name_, result)) throw exception_t("expr arg lookup '" + name_ + "' failed"); + return result; + } + std::string Symbol() const { return name_; } + + private: + const std::string name_; +}; + +class fun_expr_t : public bin_expr_t { + public: + typedef std::vector vvect_t; + fun_expr_t(const std::string& fname, const std::string& vname, const uint32_t& vnum) : fname_(fname) { + for (uint32_t i = 0; i < vnum; ++i) { + std::ostringstream var_full_name; + var_full_name << vname << "[" << i << "]"; + vvect.push_back(var_expr_t(var_full_name.str())); + } + } + const vvect_t& GetVars() const { return vvect; } + std::string Symbol() const { + const std::string var = vvect[0].Symbol(); + const std::string vname = var.substr(0, var.length() - 3); + std::ostringstream oss; + std::string str("("); + str.back() = ')'; + oss << fname_ << "(" << vname << "," << vvect.size() << ")"; + return oss.str(); + } + + private: + const std::string fname_; + vvect_t vvect; +}; +class sum_expr_t : public fun_expr_t { + public: + sum_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("sum", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + for (const auto& var : GetVars()) result += var.Eval(args); + return result; + } +}; +class avr_expr_t : public fun_expr_t { + public: + avr_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("avr", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + for (const auto& var : GetVars()) result += var.Eval(args); + return result / GetVars().size(); + } +}; +class min_expr_t : public fun_expr_t { + public: + min_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("min", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = ARGS_MAX; + for (const auto& var : GetVars()) { + args_t val = var.Eval(args); + result = (val < result) ? 
val : result; + } + return result; + } +}; +class max_expr_t : public fun_expr_t { + public: + max_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("max", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + for (const auto& var : GetVars()) { + args_t val = var.Eval(args); + result = (val > result) ? val : result; + } + return result; + } +}; + +inline const bin_expr_t* bin_expr_t::CreateExpr(const bin_expr_t* arg1, const bin_expr_t* arg2, + const char op) { + const bin_expr_t* expr = NULL; + switch (op) { + case '+': + expr = new add_expr_t(arg1, arg2); + break; + case '-': + expr = new sub_expr_t(arg1, arg2); + break; + case '*': + expr = new mul_expr_t(arg1, arg2); + break; + case '/': + expr = new div_expr_t(arg1, arg2); + break; + } + return expr; +} + +inline const bin_expr_t* bin_expr_t::CreateArg(Expr* obj, const std::string str) { + const bin_expr_t* arg = NULL; + + const unsigned i = strspn(str.c_str(), "1234567890"); + if (i == str.length()) { + const unsigned value = atoi(str.c_str()); + arg = new const_expr_t(value); + } + + if (arg == NULL) { + const std::size_t pos = str.find('('); + if (pos != std::string::npos) { + char* fname = NULL; + char* vname = NULL; + int vnum = 0; + int ret = sscanf(str.c_str(), "%m[a-zA-Z_](%m[0-9a-zA-Z_],%d)", &fname, &vname, &vnum); + if (ret == 3) { + const std::string fun_name(fname); + const fun_expr_t* farg = NULL; + if (fun_name == "sum") { + farg = new sum_expr_t(vname, vnum); + } else if (fun_name == "avr") { + farg = new avr_expr_t(vname, vnum); + } else if (fun_name == "min") { + farg = new min_expr_t(vname, vnum); + } else if (fun_name == "max") { + farg = new max_expr_t(vname, vnum); + } + if (farg) for (const auto& var : farg->GetVars()) obj->AddVar(var.Symbol()); + arg = farg; + } + free(fname); + free(vname); + } + } + + if (arg == NULL) { + const std::string sub_expr = obj->Lookup(str); + if (sub_expr.empty()) { + arg = new var_expr_t(str); + 
obj->AddVar(str); + } else { + const Expr* expr = new Expr(sub_expr, obj); + arg = expr->GetTree(); + } + } + + return arg; +} + +} // namespace xml + +#endif // _SRC_XML_EXPR_H diff --git a/src/xml/xml.h b/src/xml/xml.h new file mode 100644 index 00000000..933cd2b6 --- /dev/null +++ b/src/xml/xml.h @@ -0,0 +1,457 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
/*******************************************************************************/

#ifndef SRC_XML_XML_H_
#define SRC_XML_XML_H_

// NOTE(review): the header names were lost in extraction (eleven bare
// '#include' lines remained); the list below is restored from what the code
// uses -- confirm against the original file.
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

namespace xml {

// Minimal reader for the XML-like configuration format used by the profiler.
//
// A file is tokenized on blanks and line ends; '#' starts a comment that runs
// to the end of the line. Every element becomes a level_t that is registered
// in a map under its dot-separated path starting at "top" (e.g. "top.metric").
// A leading run of '#include "file"' lines pulls other files, resolved
// relative to the including file, into the same map.
class Xml {
 public:
  // NOTE(review): all template argument lists below were lost in extraction
  // and are restored from usage -- confirm.
  typedef std::vector<char> token_t;

  struct level_t;
  typedef std::vector<level_t*> nodes_t;
  typedef std::map<std::string, std::string> opts_t;
  struct level_t {
    std::string tag;  // element name
    nodes_t nodes;    // child elements
    opts_t opts;      // key=value attributes
  };
  typedef std::vector<level_t*> nodes_vec_t;
  typedef std::map<std::string, nodes_vec_t> map_t;

  enum { DECL_STATE, BODY_STATE };

  // Parses 'file_name' together with the files it includes.
  // 'obj' is used internally for included files, which add their nodes to the
  // map of the including object. Returns NULL when the file or one of its
  // includes cannot be opened; malformed input terminates the process.
  static Xml* Create(const std::string& file_name, const Xml* obj = NULL) {
    Xml* xml = new Xml(file_name, obj);
    if (!xml->Init()) {
      delete xml;
      return NULL;
    }

    const std::size_t pos = file_name.rfind('/');
    const std::string path = (pos != std::string::npos) ? file_name.substr(0, pos + 1) : "";

    xml->PreProcess();

    // Collect the include nodes not handled yet; "touch" marks a handled one
    // because the map is shared with every included file.
    nodes_t incl_nodes;
    for (auto* node : xml->GetNodes("top.include")) {
      if (node->opts.find("touch") == node->opts.end()) {
        node->opts["touch"] = "";
        incl_nodes.push_back(node);
      }
    }
    for (auto* incl : incl_nodes) {
      const std::string incl_name = path + incl->opts["file"];
      Xml* ixml = Create(incl_name, xml);
      if (ixml == NULL) {
        delete xml;
        return NULL;
      }
      delete ixml;
    }

    xml->Process();
    return xml;
  }

  static void Destroy(Xml* xml) { delete xml; }

  std::string GetName() { return file_name_; }

  // Registers a synthetic node under 'full_tag' with "name" and "expr" options.
  void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) {
    const std::size_t pos = full_tag.rfind('.');
    const std::size_t pos1 = (pos == std::string::npos) ? 0 : pos + 1;
    const std::string level_tag = full_tag.substr(pos1);
    level_t* level = new level_t;
    (*map_)[full_tag].push_back(level);
    level->tag = level_tag;
    level->opts["name"] = name;
    level->opts["expr"] = expr;
  }

  // Same as AddExpr() with the decimal text of 'val' as the expression.
  void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) {
    std::ostringstream oss;
    oss << val;
    AddExpr(full_tag, name, oss.str());
  }

  // Returns the nodes registered under 'global_tag' (empty when there are
  // none). Uses find() so that a query no longer inserts an empty map entry.
  nodes_t GetNodes(const std::string& global_tag) {
    nodes_t nodes;
    if (map_ != NULL) {
      const map_t::const_iterator it = map_->find(global_tag);
      if (it != map_->end()) nodes = it->second;
    }
    return nodes;
  }

  // Calls f.fun(global_tag, node) for every node. A 'false' result stops the
  // iteration over the current tag only. Returns the functor copy that was used.
  template <class F> F ForEach(const F& f_i) {
    F f = f_i;
    if (map_) {
      for (auto& entry : *map_) {
        for (auto node : entry.second) {
          if (f.fun(entry.first, node) == false) break;
        }
      }
    }
    return f;
  }

  template <class F> F ForEach(const F& f_i) const {
    F f = f_i;
    if (map_) {
      for (auto& entry : *map_) {
        for (auto node : entry.second) {
          if (f.fun(entry.first, node) == false) break;
        }
      }
    }
    return f;
  }

  struct print_func {
    bool fun(const std::string& global_tag, level_t* node) {
      for (auto& opt : node->opts) {
        std::cout << global_tag << "." << opt.first << " = " << opt.second << std::endl;
      }
      return true;
    }
  };

  void Print() const {
    std::cout << "XML file '" << file_name_ << "':" << std::endl;
    ForEach(print_func());
  }

 private:
  Xml(const std::string& file_name, const Xml* obj)
      : file_name_(file_name),
        file_line_(0),
        fd_(-1),
        data_size_(0),
        index_(0),
        state_(BODY_STATE),
        comment_(false),
        included_(false),
        level_(NULL),
        map_(NULL) {
    if (obj != NULL) {
      // An included file adds to the node map of the including object.
      map_ = obj->map_;
      level_ = obj->level_;
      included_ = true;
    }
  }

  // The object owns a descriptor and, unless included, the node map.
  Xml(const Xml&) = delete;
  Xml& operator=(const Xml&) = delete;

  struct delete_func {
    bool fun(const std::string&, level_t* node) {
      delete node;
      return true;
    }
  };

  ~Xml() {
    // The descriptor opened by Init() was never closed before.
    if (fd_ != -1) close(fd_);
    if (included_ == false) {
      ForEach(delete_func());
      delete map_;
    }
  }

  // Opens the file and, for a non-included object, creates the map and the
  // root level "top". Returns false when the file cannot be opened.
  bool Init() {
    fd_ = open(file_name_.c_str(), O_RDONLY);
    if (fd_ == -1) {
      // perror((std::string("open XML file ") + file_name_).c_str());
      return false;
    }

    if (map_ == NULL) {
      map_ = new map_t;
      AddLevel("top");
    }

    return true;
  }

  // Registers the leading run of '#include "file"' lines as "top.include"
  // nodes and rewinds the file. Scanning stops at the first line that is not
  // an include directive; before, every following 256-byte chunk was tested
  // too, so a directive was honoured or ignored depending on its file offset.
  void PreProcess() {
    static const char kDirective[] = "#include \"";
    const size_t directive_len = sizeof(kDirective) - 1;
    char buf[kBufSize];
    bool error = false;
    buf[0] = '\0';

    while (1) {
      const off_t pos = lseek(fd_, 0, SEEK_CUR);
      // Read one byte less than the buffer so that the terminator never
      // overwrites data. read() returns a signed count: -1 must stop the loop.
      const ssize_t count = read(fd_, buf, kBufSize - 1);
      if (count <= 0) break;
      size_t size = static_cast<size_t>(count);
      buf[size] = '\0';

      if (strncmp(buf, kDirective, directive_len) != 0) break;

      size_t ind = 0;
      while ((ind < size) && (buf[ind] != '\n')) ++ind;
      // No line end in a completely filled buffer: the line is too long.
      // A short read without a line end is the last line of the file.
      if ((ind == size) && (size == kBufSize - 1)) {
        fprintf(stderr, "XML PreProcess failed, line size limit %zu\n", kBufSize);
        error = true;
        break;
      }
      buf[ind] = '\0';
      size = ind;
      // Continue right after this line.
      lseek(fd_, pos + static_cast<off_t>(ind) + 1, SEEK_SET);

      for (ind = directive_len; (ind < size) && (buf[ind] != '"'); ++ind) {}
      if (ind == size) {
        // Closing quote is missing.
        error = true;
        break;
      }
      buf[ind] = '\0';

      AddLevel("include");
      AddOption("file", &buf[directive_len]);
      UpLevel();
    }

    if (error) {
      fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf);
      exit(1);
    }

    lseek(fd_, 0, SEEK_SET);
  }

  // Parses the token stream. BODY_STATE expects "<tag", "<tag>" or "</tag>";
  // DECL_STATE expects "key=value" options until a token starting with '>'.
  void Process() {
    token_t remainder;

    while (1) {
      token_t token = (remainder.size()) ? remainder : NextToken();
      remainder.clear();

      // End of file
      if (token.size() == 0) break;

      switch (state_) {
        case BODY_STATE:
          if (token[0] == '<') {
            bool node_begin = true;
            unsigned ind = 1;
            // Bounds check: a lone "<" has no second character.
            if ((token.size() > 1) && (token[1] == '/')) {
              node_begin = false;
              ++ind;
            }

            unsigned i = ind;
            while (i < token.size()) {
              if (token[i] == '>') break;
              ++i;
            }
            // Whatever follows '>' is handled as the next token.
            for (unsigned j = i + 1; j < token.size(); ++j) remainder.push_back(token[j]);

            if (i == token.size()) {
              if (node_begin)
                state_ = DECL_STATE;
              else
                BadFormat(token);
              token.push_back('\0');
            } else {
              token[i] = '\0';
            }

            const char* tag = &token[ind];
            if (*tag == '\0') BadFormat(token);
            if (node_begin) {
              AddLevel(tag);
            } else {
              // Exact match: the previous prefix compare let "</ab>" close "<abc>".
              if (CurrentLevel() != tag) {
                token[i] = '>';
                BadFormat(token);
              }
              UpLevel();
            }
          } else {
            BadFormat(token);
          }
          break;
        case DECL_STATE:
          if (token[0] == '>') {
            state_ = BODY_STATE;
            for (unsigned j = 1; j < token.size(); ++j) remainder.push_back(token[j]);
            continue;
          } else {
            token.push_back('\0');
            unsigned j = 0;
            for (j = 0; j < token.size(); ++j)
              if (token[j] == '=') break;
            if (j == token.size()) BadFormat(token);
            token[j] = '\0';
            const char* key = &token[0];
            const char* value = &token[j + 1];
            AddOption(key, value);
          }
          break;
        default:
          std::cout << "XML parser error: wrong state: " << state_ << std::endl;
          exit(1);
      }
    }
  }

  bool SpaceCheck() const {
    bool cond = ((buffer_[index_] == ' ') || (buffer_[index_] == '\t'));
    return cond;
  }

  // True at a line end and inside a '#' comment. Has side effects: a line end
  // is turned into a blank, counted, and ends the comment.
  bool LineEndCheck() {
    bool found = false;
    if (buffer_[index_] == '\n') {
      buffer_[index_] = ' ';
      ++file_line_;
      found = true;
      comment_ = false;
    } else if (comment_ || (buffer_[index_] == '#')) {
      found = true;
      comment_ = true;
    }
    return found;
  }

  // Returns the next blank-separated token, or an empty token at end of file.
  // Double quotes group characters (blanks included) and are dropped; a
  // backslash escapes the following quote or backslash.
  token_t NextToken() {
    token_t token;
    bool in_string = false;
    bool special_symb = false;

    while (1) {
      if (data_size_ == 0) {
        // Keep the signed result: stored in an unsigned, -1 looked like data.
        const ssize_t count = read(fd_, buffer_, kBufSize);
        if (count <= 0) {
          // The file ended inside a quoted string or after a backslash.
          if (special_symb || in_string) BadFormat(token);
          break;
        }
        data_size_ = static_cast<unsigned>(count);
      }

      if (token.empty()) {
        while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) {
          ++index_;
        }
      }
      while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) {
        const char symb = buffer_[index_];
        bool skip_symb = false;

        switch (symb) {
          case '\\':
            if (special_symb) {
              special_symb = false;
            } else {
              special_symb = true;
              skip_symb = true;
            }
            break;
          case '"':
            if (special_symb) {
              special_symb = false;
            } else {
              in_string = !in_string;
              if (!in_string) {
                // Turn the closing quote into a blank and visit it again so
                // that it ends the token.
                buffer_[index_] = ' ';
                --index_;
              }
              skip_symb = true;
            }
            break;
        }

        if (!skip_symb) token.push_back(symb);
        ++index_;
      }

      if (index_ == data_size_) {
        // Buffer exhausted: refill and keep extending the same token.
        index_ = 0;
        data_size_ = 0;
      } else {
        if (special_symb || in_string) BadFormat(token);
        break;
      }
    }

    return token;
  }

  // Reports a malformed token with file name and line, then exits.
  void BadFormat(token_t token) {
    token.push_back('\0');
    std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '"
              << &token[0] << "'" << std::endl;
    exit(1);
  }

  // Opens a child level of the current one and registers it under its
  // dot-separated path.
  void AddLevel(const std::string& tag) {
    level_t* level = new level_t;
    level->tag = tag;
    if (level_) {
      level_->nodes.push_back(level);
      stack_.push_back(level_);
    }
    level_ = level;

    std::string global_tag;
    for (level_t* parent : stack_) {
      global_tag += parent->tag + ".";
    }
    global_tag += tag;
    (*map_)[global_tag].push_back(level_);
  }

  // Returns to the parent level. A closing tag without an open parent (for
  // example a stray "</top>") is reported instead of popping an empty stack.
  void UpLevel() {
    if (stack_.empty()) BadFormat(token_t(level_->tag.begin(), level_->tag.end()));
    level_ = stack_.back();
    stack_.pop_back();
  }

  std::string CurrentLevel() const { return level_->tag; }

  void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; }

  const std::string file_name_;
  unsigned file_line_;
  int fd_;

  static const size_t kBufSize = 256;
  char buffer_[kBufSize];

  unsigned data_size_;
  unsigned index_;
  unsigned state_;
  bool comment_;
  std::vector<level_t*> stack_;
  bool included_;
  level_t* level_;
  map_t* map_;
};

}  // namespace xml

#endif  // SRC_XML_XML_H_

// ---- patch metadata and next-file header that shared this physical line (kept verbatim) ----
// diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
// new file mode 100644
// index 00000000..278bc5c4
// --- /dev/null
// +++ b/test/CMakeLists.txt
// @@ -0,0 +1,62 @@
// +################################################################################
// +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
// +#
// +# Permission is hereby granted, free of charge, to any person obtaining a copy
// +# of this software and associated documentation files (the "Software"), to deal
// +# in the Software without restriction, including without limitation the rights
// +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// +# copies of the Software, and to permit persons to whom the Software is
// +# furnished to do so, subject to the following conditions:
// +#
// +# The above copyright notice and this permission notice shall be included in
// +# all copies or substantial portions of the Software.
// +#
// +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// +# THE SOFTWARE.
################################################################################

# Builds the "ctrl" test executable and the "tool" shared library and stages
# the runtime files (code objects, run script, XML inputs) in the build tree.
cmake_minimum_required ( VERSION 3.5.0 )
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )

set ( EXE_NAME "ctrl" )

# Stand-alone build: TEST_DIR is only predefined when a parent project adds
# this directory.
if ( NOT DEFINED TEST_DIR )
  set ( TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
  project ( ${EXE_NAME} )
  ## Set build environment
  include ( env )
endif ()

## Util sources
file ( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" )

## Test control sources
set ( CTRL_SRC
  ${TEST_DIR}/app/test.cpp
  ${TEST_DIR}/ctrl/test_hsa.cpp
)

## Test kernel sources
set ( TEST_NAME simple_convolution )
set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp )
# Stage the prebuilt code objects with CMake's own file commands instead of
# 'sh -xc "cp ..."': no shell needed, paths with blanks work, and a failing
# copy stops the configure step instead of being ignored.
file ( GLOB KERN_HSACO "${TEST_DIR}/${TEST_NAME}/*.hsaco" )
if ( KERN_HSACO )
  file ( COPY ${KERN_HSACO} DESTINATION "${PROJECT_BINARY_DIR}" )
endif ()

## Building test executable
add_executable ( ${EXE_NAME} ${KERN_SRC} ${CTRL_SRC} ${UTIL_SRC} )
target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} )
target_link_libraries ( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt atomic )
# file ( COPY ) keeps the permissions of run.sh, as 'cp' did.
file ( COPY "${TEST_DIR}/run.sh" DESTINATION "${PROJECT_BINARY_DIR}" )
file ( GLOB TOOL_XML "${TEST_DIR}/tool/*.xml" )
if ( TOOL_XML )
  file ( COPY ${TOOL_XML} DESTINATION "${PROJECT_BINARY_DIR}" )
endif ()
file ( MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/RESULTS" )

## Build test library
set ( TEST_LIB "tool" )
set ( TEST_LIB_SRC ${TEST_DIR}/tool/tool.cpp ${UTIL_SRC} )
add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} )
target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} )
target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt atomic )

# ---- patch metadata and next-file header that shared this physical line (kept verbatim) ----
# diff --git a/test/app/test.cpp b/test/app/test.cpp
# new file mode 100644
# index 00000000..9e694833
# --- /dev/null
# +++ b/test/app/test.cpp
# @@ -0,0 +1,40 @@
# +/******************************************************************************
# +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "simple_convolution/simple_convolution.h" + +int main(int argc, char** argv) { + const char* kiter_s = getenv("ROCP_KITER"); + const char* diter_s = getenv("ROCP_DITER"); + const int kiter = (kiter_s != NULL) ? atol(kiter_s) : 1; + const int diter = (diter_s != NULL) ? atol(diter_s) : 1; + TestHsa::HsaInstantiate(); + for (int i = 0; i < kiter; ++i) RunKernel(argc, argv, diter); + TestHsa::HsaShutdown(); + return 0; +} diff --git a/test/ctrl/run_kernel.h b/test/ctrl/run_kernel.h new file mode 100644 index 00000000..b122664b --- /dev/null +++ b/test/ctrl/run_kernel.h @@ -0,0 +1,83 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_CTRL_RUN_KERNEL_H_ +#define TEST_CTRL_RUN_KERNEL_H_ + +#include "ctrl/test_hsa.h" +#include "util/test_assert.h" + +template bool RunKernel(int argc, char* argv[], int count = 1) { + bool ret_val = false; + + // Create test kernel object + Kernel test_kernel; + TestAql* test_aql = new TestHsa(&test_kernel); + test_aql = new Test(test_aql); + TEST_ASSERT(test_aql != NULL); + if (test_aql == NULL) return 1; + + // Initialization of Hsa Runtime + ret_val = test_aql->Initialize(argc, argv); + if (ret_val == false) { + std::cerr << "Error in the test initialization" << std::endl; + // TEST_ASSERT(ret_val); + return false; + } + + // Setup Hsa resources needed for execution + ret_val = test_aql->Setup(); + if (ret_val == false) { + std::cerr << "Error in creating hsa resources" << std::endl; + TEST_ASSERT(ret_val); + return false; + } + + // Kernel dspatch iterations + for (int i = 0; i < count; ++i) { + // Run test kernel + ret_val = test_aql->Run(); + if (ret_val == false) { + std::cerr << "Error in running the test kernel" << std::endl; + TEST_ASSERT(ret_val); + return false; + } + + // Verify the results of the execution + ret_val = test_aql->VerifyResults(); + if (ret_val) { + std::clog << "Test : Passed" << std::endl; + } else { + std::clog << "Test : Failed" << std::endl; + } + } + + // Print time taken by sample + test_aql->PrintTime(); + + test_aql->Cleanup(); + delete test_aql; + + return ret_val; +} + +#endif // TEST_CTRL_RUN_KERNEL_H_ diff --git a/test/ctrl/test_aql.h b/test/ctrl/test_aql.h new file mode 100644 index 00000000..d77363ee --- /dev/null +++ b/test/ctrl/test_aql.h @@ -0,0 +1,77 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_AQL_H_ +#define TEST_CTRL_TEST_AQL_H_ + +#include +#include + +#include "util/hsa_rsrc_factory.h" + +// Test AQL interface +class TestAql { + public: + explicit TestAql(TestAql* t = 0) : test_(t) {} + virtual ~TestAql() { + if (test_) delete test_; + } + + TestAql* Test() { return test_; } + virtual const AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; } + virtual hsa_queue_t* GetQueue() { return (test_) ? test_->GetQueue() : 0; } + virtual HsaRsrcFactory* GetRsrcFactory() { return (test_) ? test_->GetRsrcFactory() : 0; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + virtual bool Initialize(int argc, char** argv) { + return (test_) ? 
test_->Initialize(argc, argv) : true; + } + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + virtual bool Setup() { return (test_) ? test_->Setup() : true; } + + // Run the kernel + // @return bool true on success and false on failure + virtual bool Run() { return (test_) ? test_->Run() : true; } + + // Verify results + // @return bool true on success and false on failure + virtual bool VerifyResults() { return (test_) ? test_->VerifyResults() : true; } + + // Print to console the time taken to execute kernel + virtual void PrintTime() { + if (test_) test_->PrintTime(); + } + + // Release resources e.g. memory allocations + // @return bool true on success and false on failure + virtual bool Cleanup() { return (test_) ? test_->Cleanup() : true; } + + private: + TestAql* const test_; +}; + +#endif // TEST_CTRL_TEST_AQL_H_ diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp new file mode 100644 index 00000000..87861821 --- /dev/null +++ b/test/ctrl/test_hsa.cpp @@ -0,0 +1,283 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "ctrl/test_hsa.h" + +#include + +#include "util/test_assert.h" +#include "util/helper_funcs.h" +#include "util/hsa_rsrc_factory.h" + +HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL; +const AgentInfo* TestHsa::agent_info_ = NULL; +hsa_queue_t* TestHsa::hsa_queue_ = NULL; +uint32_t TestHsa::agent_id_ = 0; + +HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) { + // Instantiate an instance of Hsa Resources Factory + if (hsa_rsrc_ == NULL) { + agent_id_ = agent_ind; + + hsa_rsrc_ = HsaRsrcFactory::Create(); + + // Print properties of the agents + hsa_rsrc_->PrintGpuAgents("> GPU agents"); + + // Create an instance of Gpu agent + if (!hsa_rsrc_->GetGpuAgentInfo(agent_ind, &agent_info_)) { + agent_info_ = NULL; + std::cerr << "> error: agent[" << agent_ind << "] is not found" << std::endl; + return NULL; + } + std::clog << "> Using agent[" << agent_ind << "] : " << agent_info_->name << std::endl; + + // Create an instance of Aql Queue + if (hsa_queue_ == NULL) { + uint32_t num_pkts = 128; + if (hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_) == false) { + hsa_queue_ = NULL; + TEST_ASSERT(false); + } + } + } + return hsa_rsrc_; +} + +void TestHsa::HsaShutdown() { + if (hsa_queue_ != NULL) { + hsa_queue_destroy(hsa_queue_); + hsa_queue_ = NULL; + } + if (hsa_rsrc_) hsa_rsrc_->Destroy(); +} + +bool TestHsa::Initialize(int arg_cnt, char** arg_list) { + std::clog << "TestHsa::Initialize :" << std::endl; + + // Instantiate a Timer object + setup_timer_idx_ = hsa_timer_.CreateTimer(); + dispatch_timer_idx_ = hsa_timer_.CreateTimer(); + + if (HsaInstantiate(agent_id_) == NULL) { + TEST_ASSERT(false); + 
return false; + } + + // Obtain handle of signal + hsa_rsrc_->CreateSignal(1, &hsa_signal_); + + // Obtain the code object file name + std::string agentName(agent_info_->name); + if (agentName.compare(0, 4, "gfx8") == 0) { + brig_path_obj_.append("gfx8"); + } else if (agentName.compare(0, 4, "gfx9") == 0) { + brig_path_obj_.append("gfx9"); + } else { + TEST_ASSERT(false); + return false; + } + brig_path_obj_.append("_" + name_ + ".hsaco"); + + return true; +} + +bool TestHsa::Setup() { + std::clog << "TestHsa::setup :" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(setup_timer_idx_); + + // Load and Finalize Kernel Code Descriptor + const char* brig_path = brig_path_obj_.c_str(); + bool suc = hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, name_.c_str(), &hsa_exec_, + &kernel_code_desc_); + if (suc == false) { + std::cerr << "Error in loading and finalizing Kernel" << std::endl; + return false; + } + + mem_map_t& mem_map = test_->GetMemMap(); + for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) { + mem_descr_t& des = it->second; + switch (des.id) { + case TestKernel::LOCAL_DES_ID: + des.ptr = hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size); + break; + case TestKernel::KERNARG_DES_ID: { + // Check the kernel args size + const size_t kernarg_size = des.size; + size_t size_info = 0; + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info); + const bool kernarg_missmatch = (kernarg_size > size_info); + if (kernarg_missmatch) { + std::cout << "kernarg_size = " << kernarg_size << ", size_info = " << size_info + << std::flush << std::endl; + TEST_ASSERT(!kernarg_missmatch); + break; + } + // ALlocate kernarg memory + des.size = size_info; + des.ptr = hsa_rsrc_->AllocateKernArgMemory(agent_info_, size_info); + if (des.ptr) memset(des.ptr, 0, size_info); + break; + } + case TestKernel::SYS_DES_ID: + des.ptr = hsa_rsrc_->AllocateSysMemory(agent_info_, des.size); + if 
(des.ptr) memset(des.ptr, 0, des.size); + break; + case TestKernel::NULL_DES_ID: + des.ptr = NULL; + break; + default: + break; + } + TEST_ASSERT(des.ptr != NULL); + if (des.ptr == NULL) return false; + } + test_->Init(); + + // Stop the timer object + hsa_timer_.StopTimer(setup_timer_idx_); + setup_time_taken_ = hsa_timer_.ReadTimer(setup_timer_idx_); + total_time_taken_ = setup_time_taken_; + + return true; +} + +bool TestHsa::Run() { + std::clog << "TestHsa::run :" << std::endl; + + const uint32_t work_group_size = 64; + const uint32_t work_grid_size = test_->GetGridSize(); + uint32_t group_segment_size = 0; + uint32_t private_segment_size = 0; + uint64_t code_handle = 0; + + // Retrieve the amount of group memory needed + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); + + // Retrieve the amount of private memory needed + hsa_executable_symbol_get_info(kernel_code_desc_, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &private_segment_size); + + + // Retrieve handle of the code block + hsa_executable_symbol_get_info(kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &code_handle); + + // Initialize the dispatch packet. 
+ hsa_kernel_dispatch_packet_t aql; + memset(&aql, 0, sizeof(aql)); + // Set the packet's type, barrier bit, acquire and release fences + aql.header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE; + // Populate Aql packet with default values + aql.setup = 1; + aql.grid_size_x = work_grid_size; + aql.grid_size_y = 1; + aql.grid_size_z = 1; + aql.workgroup_size_x = work_group_size; + aql.workgroup_size_y = 1; + aql.workgroup_size_z = 1; + // Bind the kernel code descriptor and arguments + aql.kernel_object = code_handle; + aql.kernarg_address = test_->GetKernargPtr(); + aql.group_segment_size = group_segment_size; + aql.private_segment_size = private_segment_size; + // Initialize Aql packet with handle of signal + hsa_signal_store_relaxed(hsa_signal_, 1); + aql.completion_signal = hsa_signal_; + + std::clog << "> Executing kernel: \"" << name_ << "\"" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(dispatch_timer_idx_); + + // Submit AQL packet to the queue + const uint64_t que_idx = hsa_rsrc_->Submit(hsa_queue_, &aql); + + std::clog << "> Waiting on kernel dispatch signal, que_idx=" << que_idx << std::endl; + + // Wait on the dispatch signal until the kernel is finished. 
+ // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling + if (hsa_signal_wait_scacquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) { + TEST_ASSERT("signal_wait failed"); + } + + std::clog << "> DONE, que_idx=" << que_idx << std::endl; + + // Stop the timer object + hsa_timer_.StopTimer(dispatch_timer_idx_); + dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_); + total_time_taken_ += dispatch_time_taken_; + + return true; +} + +bool TestHsa::VerifyResults() { + bool cmp = false; + void* output = NULL; + const uint32_t size = test_->GetOutputSize(); + bool suc = false; + + // Copy local kernel output buffers from local memory into host memory + if (test_->IsOutputLocal()) { + output = hsa_rsrc_->AllocateSysMemory(agent_info_, size); + suc = hsa_rsrc_->Memcpy(agent_info_, output, test_->GetOutputPtr(), size); + if (!suc) std::clog << "> VerifyResults: Memcpy failed" << std::endl << std::flush; + } else { + output = test_->GetOutputPtr(); + suc = true; + } + + if ((output != NULL) && suc) { + // Print the test output + test_->PrintOutput(output); + // Compare the results and see if they match + cmp = (memcmp(output, test_->GetRefOut(), size) == 0); + } + + if (test_->IsOutputLocal() && (output != NULL)) hsa_rsrc_->FreeMemory(output); + + return cmp; +} + +void TestHsa::PrintTime() { + std::clog << "Time taken for Setup by " << this->name_ << " : " << this->setup_time_taken_ + << std::endl; + std::clog << "Time taken for Dispatch by " << this->name_ << " : " << this->dispatch_time_taken_ + << std::endl; + std::clog << "Time taken in Total by " << this->name_ << " : " << this->total_time_taken_ + << std::endl; +} + +bool TestHsa::Cleanup() { + hsa_executable_destroy(hsa_exec_); + hsa_signal_destroy(hsa_signal_); + return true; +} diff --git a/test/ctrl/test_hsa.h b/test/ctrl/test_hsa.h new file mode 100644 index 00000000..84080e77 --- /dev/null +++ b/test/ctrl/test_hsa.h @@ -0,0 +1,124 @@ 
+/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_HSA_H_ +#define TEST_CTRL_TEST_HSA_H_ + +#include "ctrl/test_aql.h" +#include "ctrl/test_kernel.h" +#include "util/hsa_rsrc_factory.h" +#include "util/perf_timer.h" + +// Class implements HSA test +class TestHsa : public TestAql { + public: + // Instantiate HSA resources + static HsaRsrcFactory* HsaInstantiate(const uint32_t agent_ind = agent_id_); + static void HsaShutdown(); + static void SetQueue(hsa_queue_t* queue) { hsa_queue_ = queue; } + static uint32_t HsaAgentId() { return agent_id_; } + + // Constructor + explicit TestHsa(TestKernel* test) : test_(test), name_(test->Name()) { + total_time_taken_ = 0; + setup_time_taken_ = 0; + dispatch_time_taken_ = 0; + hsa_exec_ = {}; + } + + // Get methods for Agent Info, HAS queue, HSA Resourcse Manager + const AgentInfo* GetAgentInfo() { return agent_info_; } + hsa_queue_t* GetQueue() { return hsa_queue_; } + HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + bool Initialize(int argc, char** argv); + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + bool Setup(); + + // Run the BinarySearch kernel + // @return bool true on success and false on failure + bool Run(); + + // Verify against reference implementation + // @return bool true on success and false on failure + bool VerifyResults(); + + // Print to console the time taken to execute kernel + void PrintTime(); + + // Release resources e.g. 
memory allocations + // @return bool true on success and false on failure + bool Cleanup(); + + private: + typedef TestKernel::mem_descr_t mem_descr_t; + typedef TestKernel::mem_map_t mem_map_t; + typedef TestKernel::mem_it_t mem_it_t; + + // Test object + TestKernel* test_; + + // Path of Brig file + std::string brig_path_obj_; + + // Used to track time taken to run the sample + double total_time_taken_; + double setup_time_taken_; + double dispatch_time_taken_; + + // Handle of signal + hsa_signal_t hsa_signal_; + + // Handle of Kernel Code Descriptor + hsa_executable_symbol_t kernel_code_desc_; + + // Instance of timer object + uint32_t setup_timer_idx_; + uint32_t dispatch_timer_idx_; + PerfTimer hsa_timer_; + + // Instance of Hsa Resources Factory + static HsaRsrcFactory* hsa_rsrc_; + + // GPU id + static uint32_t agent_id_; + + // Handle to an Hsa Gpu Agent + static const AgentInfo* agent_info_; + + // Handle to an Hsa Queue + static hsa_queue_t* hsa_queue_; + + // Test kernel name + std::string name_; + + // Kernel executable + hsa_executable_t hsa_exec_; +}; + +#endif // TEST_CTRL_TEST_HSA_H_ diff --git a/test/ctrl/test_kernel.h b/test/ctrl/test_kernel.h new file mode 100644 index 00000000..95da162c --- /dev/null +++ b/test/ctrl/test_kernel.h @@ -0,0 +1,134 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_KERNEL_H_ +#define TEST_CTRL_TEST_KERNEL_H_ + +#include +#include +#include + +// Class implements kernel test +class TestKernel { + public: + // Exported buffers IDs + enum buf_id_t { KERNARG_EXP_ID, OUTPUT_EXP_ID, REFOUT_EXP_ID }; + // Memory descriptors IDs + enum des_id_t { NULL_DES_ID, LOCAL_DES_ID, KERNARG_DES_ID, SYS_DES_ID, REFOUT_DES_ID }; + + // Memory descriptors vector declaration + struct mem_descr_t { + des_id_t id; + void* ptr; + uint32_t size; + }; + + // Memory map declaration + typedef std::map mem_map_t; + typedef mem_map_t::iterator mem_it_t; + typedef mem_map_t::const_iterator mem_const_it_t; + + virtual ~TestKernel() {} + + // Initialize method + virtual void Init() = 0; + + // Return kernel memory map + mem_map_t& GetMemMap() { return mem_map_; } + + // Return NULL descriptor + static mem_descr_t NullDescriptor() { return {NULL_DES_ID, NULL, 0}; } + + // Check if decripter is local + bool IsLocal(const mem_descr_t& descr) const { return (descr.id == LOCAL_DES_ID); } + + // Methods to get the kernel attributes + const mem_descr_t& GetKernargDescr() { return *test_map_[KERNARG_EXP_ID]; } + const mem_descr_t& GetOutputDescr() { return *test_map_[OUTPUT_EXP_ID]; } + void* GetKernargPtr() { return GetKernargDescr().ptr; } + uint32_t GetKernargSize() { return GetKernargDescr().size; } + void* GetOutputPtr() { return GetOutputDescr().ptr; } + uint32_t GetOutputSize() { return GetOutputDescr().size; } + bool IsOutputLocal() { return IsLocal(GetOutputDescr()); } + virtual uint32_t GetGridSize() const = 0; + + // Return reference output + void* GetRefOut() { return test_map_[REFOUT_EXP_ID]->ptr; } + + // Print output + virtual void PrintOutput(const void* ptr) const = 0; + + // Return name + virtual std::string Name() const = 0; + + protected: + // Set buffer descriptor + bool SetInDescr(const uint32_t& buf_id, const des_id_t& des_id, const 
uint32_t& size) { + bool suc = SetMemDescr(buf_id, des_id, size); + if (des_id == KERNARG_DES_ID) { + test_map_[KERNARG_EXP_ID] = &mem_map_[buf_id]; + } + return suc; + } + + // Set results descriptor + bool SetOutDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + bool suc = SetMemDescr(buf_id, des_id, size); + test_map_[OUTPUT_EXP_ID] = &mem_map_[buf_id]; + return suc; + } + + // Set host descriptor + bool SetHostDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + bool suc = SetMemDescr(buf_id, des_id, size); + if (suc) { + mem_descr_t& descr = mem_map_[buf_id]; + descr.ptr = malloc(size); + if (des_id == REFOUT_DES_ID) { + test_map_[REFOUT_EXP_ID] = &descr; + } + if (descr.ptr == NULL) suc = false; + } + return suc; + } + + // Get memory descriptor + mem_descr_t GetDescr(const uint32_t& buf_id) const { + mem_const_it_t it = mem_map_.find(buf_id); + return (it != mem_map_.end()) ? it->second : NullDescriptor(); + } + + private: + // Set memory descriptor + bool SetMemDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) { + const mem_descr_t des = {des_id, NULL, size}; + auto ret = mem_map_.insert(mem_map_t::value_type(buf_id, des)); + return ret.second; + } + + // Kernel memory map object + mem_map_t mem_map_; + // Test memory map object + std::map test_map_; +}; + +#endif // TEST_CTRL_TEST_KERNEL_H_ diff --git a/test/run.sh b/test/run.sh new file mode 100755 index 00000000..037b47a2 --- /dev/null +++ b/test/run.sh @@ -0,0 +1,61 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ +#!/bin/sh + +test_bin_dflt=./test/ctrl + +# paths to ROC profiler and other libraries +export LD_LIBRARY_PATH=$PWD +# enable error messages logging to '/tmp/rocprofiler_log.txt' +export ROCPROFILER_LOG=1 + +# ROC profiler library loaded by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so +# tool library loaded by ROC profiler +export ROCP_TOOL_LIB=libtool.so +# clear any ROCP_PROXY_QUEUE setting inherited from the environment +unset ROCP_PROXY_QUEUE +# ROC profiler metrics config file +export ROCP_METRICS=metrics.xml +# output directory for the tool library, for metrics results file 'results.txt' +export ROCP_OUTPUT_DIR=./RESULTS + +if [ ! 
-e $ROCP_TOOL_LIB ] ; then + export ROCP_TOOL_LIB=test/libtool.so +fi + +if [ -n "$1" ] ; then + tbin="$*" +else + tbin=$test_bin_dflt +fi + +export ROCP_KITER=100 +export ROCP_DITER=100 +export ROCP_INPUT=input.xml +eval $tbin + +#valgrind --leak-check=full $tbin +#valgrind --tool=massif $tbin +#ms_print massif.out. + +exit 0 diff --git a/test/simple_convolution/gfx8_SimpleConvolution.hsaco b/test/simple_convolution/gfx8_SimpleConvolution.hsaco new file mode 100644 index 0000000000000000000000000000000000000000..831484c2267528bc7ff55923b2294aab223b1cef GIT binary patch literal 9392 zcmeGiO>YxN^zC?e?W`RqgfvZQDT_)=RS_#Ju8B}9B_wS?1QH0JMHMmeCK&ux$Bvpx zS)A~ZAR#~qA&$LPs?QJLZU<2(bU+B(Fw}vLW0lhczf&OO3RvLA03;5% zbRv`9^WghB!I40h!g+J=9<$y2+&E$vb9QE|$}vo{ z%{-RO+Eo%N4W_e&jNP5jjpsAv5}HCW)9g#;M$5_3s%U?~&UN>h-RroX^Yf-9oRV%= zadeGWByb&aKA8ThYLP@cTox*2(xYo;v6gi@nJL>x(z%q3yEl`+n9Lk3kBrzw&j6P? zIZ^N&oZ4r3%&y_#QjsJ({(1CGp!F zZ+DqzlH>MBF`p|PD`y9L2Tq`}B}?#3zIY{Fva`rKEm(W^Z)(&1QJ2fmTg;aWuDcr6 zKrua@EZLhgy;$V)BVoj1F?tFm;z3J=lEi57_%ujn&-uu-qw7`}%BRn6fs=kV_j`%Mb zsMn#QRg`I9%u$%?033RPDPRFbUueY`uQ7eWL>vg{3pB@Tpf4bT3d&!F+w*EfcZ|m{ z1yj+xD&jz|?f?rptGbRGfl7a%L3cE`xg5az3>|K84DiEZ#b8j07;)SQEa_o}JA#|b zS`cwCVrfA=fh(~TsAo`UxINEMFQnq`Km#iRU@@|%Fx)TPSZ4g5*lY6rxcz5sm7Wsy~_}{b*G1D`Y3A z0}DQ!ylhtH9GIQW!Yt06Ig}T1FHo7khB>2cjnd1URmS-Il?LXh7_WkHY8WqJ?mL9@ zH$ri|XIRqe&_2qj0mK2rLY%-EEGd{5g>-SQVV>jgUDdTQ#SH5)i&y|SchXlv2{rU^ z5@X@>B)gUAZUVfN zW2&jistZ4wyxequ<_nA=THoSOeHgE^Jxs>5dX)7)#*^5dfy++?eBH-f7a!@}NLA2qFKir3j1bjUzlc0I=$^wiMBP=kRQ z`WvqNFE#WXF8x^zeLv9|X3gXRUh^*YHoMn{ST0|(A(ooRjZI`Bb}2b_NsLNGh?VRw zOMH~friURmoX^rL4~U^?DR~j`rLj^`^aO^32iq^~<*mireY}nHHo@Bt*!areLeIHl zT}Kahd*Qxtht`8~&yR;>MRr^)|6f2X0{K4!F%r5|V^U4loy4*lNRNDpy9EGQj}1tp znw0vo?j)9Sk}v&BjK^{Qp9aa7buDqta`PBa#e?nGqqyYvM!v_9*p!Z>1_`f$cd$>o zc)ugST#tv`p)@1sPxmSWd+ zdjhmnL21<+Yv-H!&G&utn{Q^;v%7cZ)jd0@rgagKeB>d~upHn(7Z=3a$O|uk=ptca zLT(X1F`%w9jJJ1j0j)euGiQWIX$l-+w0m5-?B^{~N1R5$emU$lvpBAoi!VpY{A>O8XhQz)Mhs 
zQYoPx=@}YHCt^EN$&;zXXm%)-Bz=7&v624t>As<4HkL_75`B@x&|oqaMRn{N@ZTV`+( zLjaf!5%f z6#mbKHjqd~zOrfC`Anw`7i4K!d`atZ`)RiLimU0#OYy0f+W!FOFDa0_hoaF~lJCv^ z0|O^w*|#4b)YrCj6u0Ly>;K4C-2XTJb^kk3(b$1ZDqXxO3lscj63Gnq#0E!V$t;gh zSl(54Wm2PQ(X?~uR4mHd+l1|b%+Se5HuiXhrOw68TRML$axylMNy$!ayEs$(Mn`&f z9eATi)pq1&gqFigS~kBZxa^Ib+L?%C-%4eUbJpyih|{^n-`d(Lj(qUnwVcWkC`X_i zfpP?%bp-HviC^Lg2`f)X=gEg;7TT-XQ&36q?NX7{k>!hO@Z6;9#sZ`z;D1SFJEhb3 zTz8Hb(;$dXGp6f7o}*Ia;N|^Xm9L>Z3%fFydMT9o8zL zjCvY{Hf}dYjra^TBl)2b|H zZ$Q5qHC&GZw~E?fQVV`+^+w&io`XqA_^J#-E3f5-?R;S;$xl@Sclf*H^BL3Zx!R(Q z=O7;V3P7V~LbGVRVy1emn)=-edOcU+^Ao-LT28Yp@LfBM{dn4Yg<<=0VH5bPtO&$C z?MgxS1ziiqQw4mC6`^$Om_BFJ@^^?_iEx@Zu!`<3d6y869UOgq1v=RKTzH30a2HKWA2a)j(%vt1&h0;&X z_|wZKr~EY`S0lf=>tAi?ZgBp>gM|xIASc|w{KS~9!(85(r7#Di_Pq(WcK*VzvAq`B z&H0JXDW%#noyQXUg$Vl~wxI1m+{466&k=eCA~iNe$mG;*GG3UW0oSlXPUZ5tut_f8 z8w2h?2-P)JtqN{D8@QeGXviJ5#|y48p5F`c8ZTn`f@|jU>j9ex+|SK0y^DplP@gqJ z&2(kR@m1G_{S}|W+)Wl%0`BuQx*Eg~<4=7JF7eevJbUMA;SJ8^h6cxLYz$wn-{^R2 zHic_f&gF(-jA`IW$am%}nBWlTuLOG>_n+mst@#P$==$d;%yr0Vo^d*-aQft_I2~e~ z@`6*oz6m%rE1a^7QxG_Yz@mEZ75eu>OLs zuevr&eIZzX0a$-su=hyqSSaLpb#>vfK)vGy*M$8wu>Pz`vHrd_XF`xI(60e|*AnXw z)*f=h7+`$70Pi37#DZxxKZO4$v)FYS~3D=H#(N%v7D z{(8b7c;;T5_~&i-F0J4-Ovxa<3eG=Y8zipa_$?T}`0L{PHmgGRHC8Z4lY)O;;FAje zyudp`tWeAxfp;tTR)O~_c$dJ_3cgF=Hx+!Jz<;XX@TnV;L2fB{ufV4jykFq63O+3G z8_Iao0)MFJza#L^6&xN8AQ{Aj_W+C2yR2f61_hrKc&~zgPvB7ne_!Bn1^=PIvkLxW zfuC3K4+VZx8Slpe&nx=31b$t?e=hwf_%8*1N5St3{Goz>D)7$~{5JxJmv2kb?**_>i0pUHF`RE;`9h< z9ZZh4#v>= z9G;P+y*&3wX$wVZFV9C(;(5cj$X=eaq?G3@;f9StmQk*Q3(r2XfA#l&^7nqIi%Is& zYlLio0&**!1My`)CLxQzqvR-8mSvF04{dP_EZIvXKNpnxmGP6^YyoxbzfG7)k+ykB z#$Vzxj#9n?4RY*K2jz8}*Ji~57T3tjaLNAV{Np*Q$eo+8Jl@PJM;2jNl>JKhAQT>D NKO^i5ih)uk{{-s);-UZm literal 0 HcmV?d00001 diff --git a/test/simple_convolution/simple_convolution.cl b/test/simple_convolution/simple_convolution.cl new file mode 100644 index 00000000..3f8115a6 --- /dev/null +++ b/test/simple_convolution/simple_convolution.cl @@ -0,0 +1,76 @@ 
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*******************************************************************************/

/**
 * SimpleConvolution is where each pixel of the output image
 * is the weighted sum of the neighborhood pixels of the input image.
 * The neighborhood is defined by the dimensions of the mask and
 * the weight of each neighbor is defined by the mask itself.
 * Each work-item computes one output element; its global id is the
 * row-major index of that element (x = id % width, y = id / width).
 * @param  output          Output matrix after performing convolution
 * @param  input           Input matrix on which convolution is to be performed
 * @param  mask            Mask matrix used to perform the convolution
 * @param  inputDimensions Dimensions of the input matrix (x = width, y = height)
 * @param  maskDimensions  Dimensions of the mask matrix (x = width, y = height)
 */
__kernel void SimpleConvolution(__global uint * output,
                                __global uint * input,
                                __global float * mask,
                                const uint2 inputDimensions,
                                const uint2 maskDimensions) {

    // linear index of the output element handled by this work-item
    uint tid = get_global_id(0);

    uint width = inputDimensions.x;
    uint height = inputDimensions.y;

    // column and row of the output element
    uint x = tid%width;
    uint y = tid/width;

    uint maskWidth = maskDimensions.x;
    uint maskHeight = maskDimensions.y;

    // half extents of the mask; note that vstep is derived from the mask width and
    // applied to x, while hstep is derived from the mask height and applied to y
    uint vstep = (maskWidth -1)/2;
    uint hstep = (maskHeight -1)/2;

    // find the left, right, top and bottom indices such that
    // the indices do not go beyond image boundaries
    uint left = (x < vstep) ? 0 : (x - vstep);
    uint right = ((x + vstep) >= width) ? width - 1 : (x + vstep);
    uint top = (y < hstep) ? 0 : (y - hstep);
    uint bottom = ((y + hstep) >= height)? height - 1: (y + hstep);

    // initializing weighted sum value
    float sumFX = 0;

    for(uint i = left; i <= right; ++i) {
        for(uint j = top; j <= bottom; ++j) {
            // performing weighted sum within the mask boundaries;
            // (y - hstep) and (x - vstep) may wrap around as unsigned values near the
            // top/left border, the outer subtraction wraps back, so the mask index is
            // (j + hstep - y) * maskWidth + (i + vstep - x)
            uint maskIndex = (j - (y - hstep)) * maskWidth + (i - (x - vstep));
            uint index = j * width + i;
            sumFX += ((float)input[index] * mask[maskIndex]);
        }
    }

    // To round to the nearest integer
    sumFX += 0.5f;
    output[tid] = (uint)sumFX;
}
diff --git a/test/simple_convolution/simple_convolution.cpp b/test/simple_convolution/simple_convolution.cpp
new file mode 100644
index 00000000..546f9a6a
--- /dev/null
+++ b/test/simple_convolution/simple_convolution.cpp
@@ -0,0 +1,388 @@
/******************************************************************************
Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "simple_convolution/simple_convolution.h" + +#include +#include +#include + +#include "util/helper_funcs.h" +#include "util/test_assert.h" + +const uint32_t SimpleConvolution::input_data_[]{ + 15, 201, 51, 89, 92, 34, 96, 66, 11, 225, 161, 96, 81, 211, 108, 124, 202, 244, 182, + 90, 215, 92, 98, 20, 44, 225, 55, 247, 202, 0, 45, 218, 202, 97, 51, 39, 131, 147, + 105, 143, 116, 11, 239, 198, 222, 92, 67, 169, 81, 250, 3, 40, 86, 101, 60, 131, 70, + 116, 123, 17, 117, 168, 236, 64, 10, 31, 103, 142, 179, 209, 29, 40, 220, 13, 239, 187, + 105, 50, 100, 186, 44, 104, 227, 131, 205, 32, 6, 20, 149, 130, 38, 10, 43, 18, 75, + 53, 50, 178, 195, 230, 132, 225, 14, 96, 238, 253, 27, 88, 48, 128, 18, 92, 232, 246, + 224, 182, 23, 231, 203, 172, 105, 241, 183, 148, 4, 2, 202, 55, 181, 142, 29, 57, 111, + 43, 153, 93, 41, 181, 181, 89, 54, 200, 182, 31, 190, 150, 213, 213, 126, 160, 130, 232, + 146, 57, 125, 151, 59, 71, 206, 240, 213, 236, 42, 68, 24, 195, 162, 65, 121, 87, 155, + 175, 31, 81, 207, 222, 232, 164, 180, 102, 69, 55, 79, 216, 112, 204, 112, 171, 19, 63, + 156, 233, 43, 198, 46, 67, 138, 208, 132, 4, 39, 32, 180, 71, 113, 131, 38, 90, 40, + 219, 193, 109, 18, 16, 70, 131, 220, 182, 46, 240, 245, 203, 217, 32, 146, 7, 100, 28, + 216, 233, 32, 255, 9, 213, 71, 123, 88, 110, 213, 128, 74, 150, 238, 93, 166, 52, 224, + 131, 234, 15, 115, 224, 218, 76, 1, 108, 84, 101, 137, 44, 79, 170, 44, 88, 127, 116, + 211, 216, 226, 168, 88, 45, 63, 70, 138, 230, 123, 107, 105, 101, 122, 220, 70, 84, 41, + 71, 193, 125, 173, 75, 169, 252, 245, 213, 84, 117, 73, 40, 77, 44, 209, 166, 90, 16, + 237, 229, 246, 104, 80, 95, 206, 202, 60, 20, 31, 101, 92, 225, 226, 9, 44, 140, 5, + 34, 97, 89, 151, 171, 129, 229, 216, 82, 139, 51, 99, 120, 24, 89, 225, 104, 185, 175, + 50, 246, 196, 82, 91, 32, 51, 62, 42, 96, 202, 47, 130, 44, 137, 26, 215, 10, 255, + 176, 93, 138, 227, 193, 3, 251, 27, 229, 
100, 212, 149, 151, 202, 89, 233, 38, 122, 29, + 100, 164, 125, 46, 212, 0, 90, 93, 26, 50, 103, 25, 226, 197, 164, 198, 135, 168, 194, + 162, 141, 38, 119, 34, 190, 66, 124, 167, 104, 247, 197, 204, 156, 67, 251, 112, 67, 85, + 205, 93, 135, 53, 119, 106, 251, 28, 49, 130, 196, 243, 36, 82, 26, 155, 117, 216, 221, + 241, 128, 70, 233, 70, 18, 133, 137, 14, 245, 204, 99, 195, 42, 235, 248, 161, 86, 243, + 190, 135, 118, 130, 123, 154, 213, 150, 54, 74, 111, 20, 60, 240, 90, 37, 54, 109, 171, + 191, 123, 161, 140, 222, 100, 182, 202, 93, 88, 32, 80, 23, 168, 198, 153, 36, 97, 111, + 187, 151, 185, 43, 172, 245, 27, 6, 27, 82, 115, 199, 18, 239, 104, 158, 206, 205, 85, + 152, 42, 174, 185, 123, 197, 98, 65, 95, 135, 163, 206, 66, 59, 136, 109, 231, 125, 137, + 237, 153, 219, 97, 96, 237, 81, 201, 140, 31, 150, 226, 183, 192, 144, 113, 59, 86, 212, + 125, 182, 91, 33, 132, 158, 92, 12, 12, 68, 138, 149, 50, 36, 113, 147, 133, 95, 229, + 78, 235, 4, 228, 206, 188, 165, 95, 45, 225, 181, 1, 94, 107, 93, 128, 240, 251, 220, + 252, 7, 32, 135, 156, 83, 171, 14, 230, 48, 109, 203, 126, 89, 208, 99, 39, 140, 9, + 134, 185, 234, 60, 187, 73, 167, 24, 201, 152, 20, 166, 148, 27, 199, 28, 184, 26, 199, + 198, 0, 248, 52, 204, 119, 141, 157, 218, 181, 41, 227, 59, 227, 206, 119, 159, 23, 31, + 184, 224, 183, 204, 134, 76, 231, 77, 105, 160, 103, 48, 103, 104, 41, 155, 53, 160, 41, + 210, 123, 222, 252, 95, 26, 223, 45, 146, 126, 68, 177, 54, 37, 105, 3, 171, 182, 235, + 249, 31, 139, 97, 80, 243, 202, 121, 143, 0, 26, 184, 210, 149, 151, 207, 244, 177, 174, + 34, 67, 45, 102, 245, 100, 140, 95, 104, 55, 21, 83, 49, 53, 223, 147, 134, 210, 93, + 0, 97, 93, 26, 26, 48, 175, 178, 255, 164, 99, 174, 198, 167, 220, 45, 156, 64, 185, + 252, 168, 241, 18, 252, 35, 71, 219, 182, 205, 173, 19, 206, 15, 113, 232, 42, 161, 152, + 220, 160, 60, 64, 79, 3, 231, 43, 49, 132, 108, 235, 128, 21, 220, 146, 17, 255, 218, + 236, 182, 168, 154, 201, 118, 170, 58, 94, 212, 220, 246, 177, 125, 51, 
241, 204, 55, 216, + 248, 104, 92, 100, 83, 221, 121, 48, 111, 138, 47, 73, 119, 230, 241, 17, 175, 103, 187, + 234, 198, 144, 199, 188, 65, 68, 240, 51, 17, 39, 11, 9, 143, 104, 109, 227, 70, 231, + 19, 181, 113, 66, 255, 233, 41, 241, 250, 217, 89, 182, 196, 31, 71, 139, 220, 137, 208, + 204, 188, 225, 243, 200, 234, 131, 48, 88, 102, 119, 63, 121, 44, 177, 188, 44, 154, 229, + 29, 149, 190, 118, 76, 130, 150, 147, 14, 114, 28, 222, 62, 217, 191, 50, 161, 170, 181, + 210, 2, 28, 73, 66, 149, 117, 243, 81, 162, 141, 55, 191, 35, 245, 54, 111, 120, 204, + 2, 134, 62, 31, 100, 125, 248, 36, 175, 153, 206, 101, 107, 209, 129, 181, 19, 22, 43, + 7, 104, 205, 149, 159, 140, 184, 149, 195, 39, 14, 143, 42, 148, 205, 73, 249, 74, 66, + 30, 250, 219, 237, 96, 71, 190, 225, 253, 210, 248, 40, 218, 96, 245, 111, 0, 130, 39, + 150, 69, 79, 165, 212, 122, 57, 162, 195, 51, 237, 6, 82, 231, 225, 63, 71, 41, 253, + 41, 38, 208, 33, 78, 170, 130, 68, 26, 131, 198, 66, 26, 12, 145, 191, 224, 11, 249, + 130, 207, 44, 112, 213, 126, 88, 183, 190, 160, 225, 187, 201, 8, 140, 235, 87, 55, 109, + 155, 81, 241, 98, 147, 11, 110, 37, 202, 79, 49, 195, 210, 0, 240, 66, 214, 110, 154, + 142, 44, 58, 111, 232, 4, 119, 117, 239, 207, 172, 93, 106, 254, 78, 205, 145, 89, 59, + 183, 35, 138, 232, 230, 92, 233, 214, 159, 191, 69, 58, 78, 114, 116, 189, 91, 121, 53, + 208, 104, 4, 125, 198, 111, 123, 20, 60, 13, 109, 120, 196, 145, 3, 172, 119, 95, 150, + 78, 255, 85, 147, 57, 163, 6, 174, 97, 97, 39, 151, 50, 144, 155, 175, 86, 11, 43, + 107, 71, 56, 216, 191, 253, 105, 194, 170, 225, 34, 64, 47, 34, 150, 195, 91, 58, 201, + 10, 155, 43, 49, 50, 93, 194, 206, 13, 25, 217, 56, 132, 33, 112, 92, 225, 109, 198, + 164, 23, 167, 199, 88, 215, 234, 238, 155, 69, 40, 100, 80, 196, 144, 129, 246, 237, 68, + 197, 250, 93, 159, 51, 225, 193, 163, 62, 163, 17, 4, 71, 41, 172, 15, 130, 132, 249, + 112, 31, 63, 152, 132, 143, 92, 20, 17, 83, 1, 86, 25, 252, 179, 185, 47, 149, 122, + 211, 211, 29, 229, 216, 
101, 15, 133, 117, 145, 9, 111, 1, 40, 175, 154, 173, 62, 247, + 193, 80, 75, 194, 166, 100, 191, 90, 29, 239, 239, 152, 194, 195, 182, 168, 156, 27, 183, + 33, 145, 73, 43, 0, 75, 83, 175, 229, 0, 238, 221, 194, 63, 40, 133, 230, 140, 68, + 64, 170, 51, 48, 66, 246, 243, 248, 159, 144, 20, 87, 177, 165, 160, 220, 166, 235, 48, + 86, 209, 49, 68, 174, 243, 132, 214, 120, 106, 99, 189, 170, 13, 241, 219, 80, 232, 207, + 72, 135, 95, 92, 223, 16, 2, 127, 237, 169, 107, 29, 255, 61, 79, 68, 236, 67, 200, + 194, 188, 50, 38, 121, 221, 52, 107, 184, 132, 84, 136, 204, 219, 231, 41, 186, 248, 44, + 58, 229, 213, 166, 3, 212, 227, 82, 25, 207, 150, 225, 146, 82, 20, 185, 204, 242, 237, + 55, 170, 113, 139, 50, 62, 103, 26, 103, 34, 18, 148, 93, 247, 105, 3, 251, 62, 231, + 77, 87, 182, 227, 57, 73, 54, 77, 2, 2, 63, 239, 57, 234, 97, 197, 29, 159, 44, + 55, 7, 79, 74, 155, 172, 66, 5, 175, 61, 67, 150, 139, 155, 77, 111, 212, 151, 165, + 34, 153, 167, 98, 137, 225, 77, 234, 166, 107, 138, 211, 163, 145, 34, 237, 45, 206, 47, + 50, 126, 108, 117, 21, 248, 17, 98, 103, 230, 249, 12, 9, 147, 179, 107, 29, 149, 185, + 7, 59, 37, 146, 14, 200, 35, 49, 182, 80, 0, 230, 130, 126, 83, 248, 148, 75, 9, + 247, 178, 240, 240, 190, 249, 132, 114, 101, 161, 7, 30, 169, 67, 68, 59, 82, 12, 95, + 131, 195, 176, 131, 169, 51, 2, 252, 44, 150, 72, 54, 141, 250, 38, 126, 185, 31, 3, + 44, 132, 165, 52, 163, 78, 120, 231, 138, 202, 244, 234, 77, 183, 155, 209, 97, 207, 212, + 94, 251, 107, 166, 49, 249, 161, 88, 120, 91, 120, 123, 135, 253, 33, 188, 160, 112, 52, + 136, 250, 254, 125, 229, 76, 53, 128, 30, 150, 79, 243, 244, 75, 95, 155, 125, 88, 60, + 213, 209, 152, 78, 77, 32, 75, 110, 220, 236, 222, 17, 117, 217, 15, 242, 190, 92, 39, + 63, 123, 190, 143, 111, 178, 219, 206, 78, 88, 38, 138, 46, 247, 34, 124, 69, 66, 199, + 179, 31, 179, 145, 48, 41, 106, 64, 27, 41, 157, 67, 105, 24, 1, 249, 135, 179, 212, + 86, 1, 44, 124, 140, 91, 116, 175, 215, 185, 242, 159, 108, 17, 83, 254, 66, 
124, 105, + 131, 151, 146, 32, 218, 252, 57, 219, 245, 193, 143, 201, 23, 145, 246, 148, 30, 82, 8, + 206, 41, 194, 192, 201, 47, 210, 28, 46, 20, 152, 151, 151, 48, 42, 184, 11, 38, 241, + 231, 28, 179, 119, 230, 202, 8, 220, 94, 39, 46, 103, 245, 88, 42, 181, 33, 90, 136, + 62, 136, 156, 214, 31, 52, 7, 74, 237, 19, 113, 223, 250, 141, 146, 113, 115, 92, 122, + 80, 187, 161, 126, 35, 150, 215, 78, 76, 249, 168, 212, 55, 48, 113, 14, 80, 166, 21, + 154, 147, 40, 12, 114, 35, 153, 5, 148, 12, 98, 15, 92, 29, 176, 219, 65, 71, 179, + 143, 147, 172, 56, 104, 227, 104, 218, 241, 185, 128, 7, 84, 20, 47, 96, 135, 82, 249, + 140, 231, 6, 238, 246, 99, 12, 167, 63, 77, 238, 242, 221, 130, 158, 21, 235, 129, 126, + 197, 114, 56, 69, 121, 140, 90, 169, 237, 225, 252, 231, 109, 228, 237, 91, 219, 81, 104, + 130, 144, 181, 113, 130, 147, 244, 32, 169, 223, 162, 39, 164, 21, 95, 234, 143, 236, 68, + 57, 217, 37, 53, 192, 147, 25, 174, 239, 245, 0, 87, 119, 144, 13, 232, 19, 160, 220, + 51, 73, 188, 214, 113, 96, 235, 209, 75, 122, 190, 144, 179, 151, 181, 233, 88, 73, 3, + 7, 56, 248, 7, 143, 112, 152, 156, 89, 171, 61, 53, 223, 135, 242, 181, 248, 83, 161, + 202, 158, 28, 136, 46, 208, 32, 228, 186, 121, 45, 189, 128, 102, 182, 136, 246, 38, 32, + 147, 127, 204, 208, 181, 171, 87, 167, 97, 80, 250, 2, 26, 153, 31, 163, 200, 239, 195, + 172, 169, 60, 218, 103, 188, 65, 30, 69, 55, 68, 102, 202, 196, 50, 154, 121, 221, 242, + 33, 63, 67, 28, 66, 93, 181, 97, 0, 126, 81, 196, 43, 251, 0, 5, 98, 189, 70, + 128, 3, 126, 197, 105, 72, 137, 155, 227, 3, 121, 214, 36, 184, 25, 65, 250, 118, 247, + 91, 119, 117, 173, 60, 160, 168, 60, 166, 10, 250, 237, 139, 253, 107, 80, 102, 180, 217, + 2, 151, 221, 123, 109, 1, 52, 134, 66, 46, 253, 57, 138, 117, 175, 55, 178, 79, 223, + 239, 245, 234, 233, 226, 117, 231, 78, 198, 78, 2, 159, 80, 154, 124, 204, 7, 126, 0, + 142, 193, 47, 140, 251, 185, 2, 170, 241, 180, 249, 208, 163, 239, 186, 141, 210, 48, 116, + 32, 246, 195, 34, 150, 19, 188, 
19, 224, 196, 146, 224, 83, 83, 15, 224, 78, 201, 226, + 249, 186, 151, 243, 139, 58, 226, 70, 199, 181, 118, 60, 213, 109, 255, 248, 3, 19, 181, + 23, 243, 122, 169, 212, 205, 252, 228, 173, 75, 173, 144, 68, 104, 39, 55, 243, 98, 26, + 57, 41, 207, 175, 102, 165, 29, 102, 158, 32, 121, 83, 56, 109, 205, 225, 66, 155, 222, + 38, 73, 42, 212, 218, 110, 60, 1, 166, 48, 99, 193, 105, 141, 145, 25, 244, 54, 54, + 90, 213, 87, 212, 40, 143, 66, 246, 112, 132, 146, 79, 171, 220, 121, 128, 182, 232, 189, + 184, 143, 237, 27, 80, 86, 169, 226, 112, 158, 25, 166, 248, 238, 253, 204, 23, 141, 15, + 13, 254, 147, 160, 77, 63, 124, 199, 191, 50, 175, 124, 234, 62, 105, 6, 143, 192, 176, + 113, 48, 78, 139, 215, 71, 121, 213, 20, 144, 98, 35, 158, 96, 183, 62, 174, 246, 187, + 117, 182, 237, 37, 50, 216, 99, 156, 223, 243, 93, 143, 101, 142, 222, 240, 101, 37, 106, + 58, 57, 250, 157, 93, 153, 254, 20, 216, 172, 10, 147, 34, 192, 129, 71, 243, 90, 171, + 144, 57, 159, 238, 201, 4, 124, 167, 244, 225, 205, 95, 28, 7, 89, 185, 100, 243, 184, + 121, 203, 100, 131, 95, 135, 68, 224, 207, 56, 58, 122, 201, 115, 25, 183, 61, 30, 51, + 229, 18, 21, 178, 113, 49, 186, 203, 235, 31, 191, 163, 152, 138, 8, 28, 233, 143, 97, + 202, 95, 153, 4, 217, 98, 120, 243, 26, 182, 17, 77, 155, 36, 99, 78, 150, 149, 8, + 98, 128, 39, 33, 36, 192, 172, 45, 220, 149, 189, 61, 96, 28, 215, 100, 246, 58, 221, + 233, 84, 147, 251, 162, 47, 31, 5, 125, 181, 154, 134, 23, 27, 174, 57, 64, 110, 229, + 109, 75, 123, 43, 136, 219, 71, 95, 64, 61, 154, 29, 39, 238, 177, 34, 145, 225, 65, + 150, 94, 247, 49, 229, 15, 77, 147, 72, 141, 2, 45, 251, 77, 169, 38, 213, 132, 110, + 53, 196, 172, 207, 226, 212, 190, 148, 246, 79, 117, 56, 230, 212, 48, 23, 185, 63, 100, + 76, 136, 242, 78, 181, 237, 156, 95, 20, 113, 227, 131, 167, 168, 47, 119, 139, 3, 53, + 31, 250, 133, 149, 50, 107, 105, 99, 130, 34, 162, 231, 111, 42, 217, 190, 224, 199, 90, + 63, 220, 204, 35, 95, 115, 203, 143, 234, 86, 147, 32, 118, 141, 165, 
11, 192, 16, 117, + 35, 147, 152, 198, 123, 7, 240, 84, 198, 209, 28, 33, 17, 248, 237, 52, 88, 97, 255, + 231, 76, 86, 122, 109, 204, 8, 18, 216, 201, 35, 77, 237, 183, 229, 179, 50, 237, 164, + 135, 179, 118, 164, 213, 135, 157, 195, 187, 245, 36, 187, 220, 113, 18, 87, 222, 222, 96, + 241, 183, 42, 21, 4, 23, 205, 233, 203, 0, 214, 112, 136, 138, 230, 44, 95, 110, 201, + 34, 41, 191, 71, 229, 155, 185, 247, 243, 151, 214, 84, 137, 141, 126, 159, 146, 149, 108, + 124, 97, 109, 82, 209, 245, 221, 183, 34, 60, 37, 236, 95, 79, 171, 167, 53, 71, 96, + 45, 58, 248, 3, 142, 129, 145, 12, 33, 36, 162, 142, 160, 3, 251, 243, 213, 240, 208, + 141, 19, 13, 178, 255, 109, 2, 170, 20, 55, 241, 116, 101, 44, 108, 105, 186, 238, 251, + 199, 15, 31, 106, 157, 191, 110, 152, 178, 67, 137, 131, 208, 156, 144, 131, 155, 253, 134, + 70, 18, 190, 55, 134, 35, 99, 243, 140, 30, 225, 135, 230, 240, 166, 81, 142, 102, 191, + 39, 25, 3, 177, 156, 211, 77, 45, 87, 233, 43, 221, 48, 61, 155, 103, 195, 191, 203, + 182, 75, 233, 152, 211, 208, 136, 121, 33, 23, 224, 224, 62, 249, 227, 239, 149, 183, 61, + 195, 15, 39, 238, 236, 87, 43, 136, 191, 239, 71, 138, 166, 147, 116, 62, 102, 68, 199, + 224, 101, 223, 193, 70, 29, 186, 42, 13, 80, 225, 75, 19, 241, 115, 1, 221, 202, 45, + 102, 137, 29, 174, 20, 195, 66, 136, 2, 168, 205, 201, 137, 50, 168, 74, 121, 198, 4, + 163, 212, 85, 133, 31, 105, 118, 146, 106, 84, 93, 152, 187, 231, 181, 105, 251, 121, 171, + 132, 123, 84, 81, 69, 221, 132, 238, 40, 253, 181, 45, 161, 137, 130, 39, 169, 235, 158, + 59, 86, 242, 153, 239, 173, 128, 165, 23, 123, 30, 195, 0, 154, 23, 81, 224, 245, 214, + 206, 30, 212, 131, 75, 117, 12, 206, 157, 181, 186, 59, 241, 17, 45, 138, 0, 219, 11, + 165, 243, 135, 196, 182, 135, 95, 205, 217, 63, 195, 175, 14, 225, 131, 145, 45, 249, 158, + 251, 150, 84, 182, 209, 70, 199, 255, 209, 199, 219, 220, 109, 206, 99, 50, 132, 234, 146, + 82, 195, 209, 22, 114, 223, 247, 246, 113, 37, 239, 16, 33, 134, 100, 215, 88, 170, 158, 
+ 87, 123, 102, 50, 88, 211, 1, 187, 6, 134, 165, 152, 216, 105, 106, 239, 220, 74, 231, + 210, 187, 12, 194, 204, 45, 72, 49, 4, 160, 219, 162, 248, 87, 8, 43, 176, 220, 44, + 107, 227, 178, 17, 124, 139, 122, 230, 122, 87, 48, 97, 42, 236, 110, 236, 185, 155, 53, + 234, 159, 214, 198, 66, 206, 30, 75, 249, 206, 40, 38, 57, 11, 217, 74, 136, 100, 197, + 110, 223, 29, 159, 65, 71, 140, 175, 51, 69, 74, 105, 48, 234, 63, 246, 45, 13, 20, + 121, 7, 226, 161, 46, 28, 173, 7, 103, 53, 108, 45, 164, 76, 74, 68, 141, 145, 208, + 61, 197, 22, 136, 46, 70, 115, 110, 60, 161, 124, 81, 26, 132, 51, 188, 178, 79, 106, + 186, 183, 160, 39, 228, 68, 115, 46, 136, 1, 192, 89, 62, 133, 112, 198, 180, 182, 58, + 34, 243, 219, 158, 69, 245, 34, 120, 178, 213, 200, 28, 143, 128, 188, 182, 100, 1, 41, + 146, 137, 43, 82, 227, 105, 216, 83, 48, 140, 10, 106, 175, 254, 70, 77, 67, 59, 112, + 188, 237, 69, 133, 10, 212, 5, 198, 138, 105, 199, 180, 252, 81, 223, 79, 53, 73, 39, + 137, 121, 180, 148, 228, 99, 146, 42, 177, 214, 102, 33, 147, 84, 102, 25, 94, 59, 31, + 37, 197, 137, 237, 122, 133, 63, 90, 213, 116, 163, 253, 253, 29, 177, 145, 2, 21, 36, + 45, 198, 251, 147, 231, 143, 232, 78, 168, 71, 137, 199, 108, 79, 80, 90, 201, 214, 153, + 35, 172, 13, 199, 169, 11, 228, 91, 157, 231, 112, 193, 20, 54, 189, 167, 30, 77, 144, + 108, 245, 215, 246, 189, 68, 69, 14, 158, 14, 228, 55, 50, 145, 69, 249, 58, 80, 222, + 149, 237, 198, 5, 175, 218, 60, 109, 130, 91, 186, 18, 200, 175, 234, 190, 109, 46, 3, + 123, 204, 18, 96, 4, 68, 241, 73, 62, 44, 154, 29, 193, 136, 227, 199, 55, 189, 4, + 164, 64, 95, 95, 82, 39, 15, 60, 230, 124, 107, 233, 248, 55, 251, 89, 60, 63, 75, + 134, 126, 119, 32, 156, 57, 168, 127, 0, 224, 61, 5, 133, 125, 100, 228, 208, 140, 243, + 12, 114, 111, 119, 92, 104, 175, 87, 193, 236, 151, 13, 114, 21, 132, 146, 177, 189, 59, + 49, 190, 27, 110, 195, 160, 236, 40, 132, 188, 181, 120, 201, 40, 232, 65, 132, 80, 241, + 220, 18, 221, 115, 31, 79, 137, 164, 226, 58, 98, 
29, 108, 32, 57, 219, 228, 218, 199, + 13, 95, 132, 195, 215, 77, 235, 191, 143, 112, 16, 128, 76, 35, 93, 191, 66, 173, 73, + 231, 143, 132, 73, 173, 240, 106, 231, 203, 78, 193, 147, 92, 33, 23, 31, 248, 100, 11, + 184, 243, 123, 201, 115, 200, 236, 209, 135, 47, 126, 209, 22, 14, 85, 95, 188, 69, 202, + 163, 17, 24, 101, 164, 117, 134, 187, 148, 127, 31, 159, 55, 19, 27, 1, 135, 227, 237, + 89, 107, 28, 216, 60, 51, 230, 145, 147, 163, 215, 93, 70, 232, 118, 172, 140, 235, 50, + 71, 128, 177, 103, 32, 233, 123, 60, 234, 2, 31, 216, 91, 139, 244, 52, 200, 40, 26, + 90, 188, 189, 49, 25, 4, 25, 144, 176, 166, 124, 227, 237, 252, 148, 85, 29, 125, 208, + 89, 104, 210, 121, 64, 46, 4, 53, 99, 204, 93, 125, 38, 25, 59, 88, 51, 64, 113, + 195, 241, 23, 64, 212, 5, 60, 104, 90, 90, 230, 42, 179, 78, 253, 44, 143, 44, 49, + 196, 143, 254, 34, 13, 36, 60, 73, 125, 112, 137, 239, 52, 122, 7, 116, 79, 12, 177, + 183, 103, 11, 158, 146, 190, 237, 143, 235, 124, 188, 28, 65, 76, 26, 100, 89, 63, 160, + 163, 188, 17, 44, 172, 69, 167, 179, 185, 246, 191, 107, 174, 38, 118, 76, 184, 53, 58, + 72, 32, 182, 5, 61, 248, 81, 88, 92, 170, 152, 253, 77, 84, 14, 122, 1, 83, 34, + 180, 13, 25, 115, 120, 199, 154, 238, 20, 83, 36, 79, 155, 68, 5, 160, 130, 254, 242, + 218, 90, 156, 114, 87, 234, 199, 101, 101, 200, 185, 135, 124, 198, 160, 240, 62, 104, 138, + 45, 125, 222, 81, 204, 122, 150, 210, 26, 24, 208, 12, 242, 42, 169, 101, 130, 148, 44, + 232, 249, 245, 161, 128, 113, 103, 33, 98, 166, 137, 236, 212, 7, 202, 38, 211, 69, 188, + 165, 95, 212, 118, 108, 199, 161, 22, 45, 35, 170, 90, 11, 163, 79, 173, 36, 193, 20, + 69, 35, 187, 207, 16, 144, 214, 219, 182, 170, 32, 114, 79, 128, 71, 198, 237, 15, 103, + 4, 60, 139, 175, 150, 151, 82, 230, 68, 119, 168, 89, 188, 204, 20, 140, 220, 165, 98, + 184, 91, 12, 217, 205, 92, 90, 20, 35, 71, 36, 138, 76, 96, 22, 251, 247, 173, 78, + 222, 241, 197, 134, 75, 130, 83, 96, 14, 47, 5, 113, 232, 96, 126, 193, 45, 218, 28, + 66, 253, 99, 103, 
136, 176, 200, 158, 171, 191, 76, 249, 158, 62, 190, 37, 137, 65, 120, + 233, 80, 168, 238, 193, 145, 79, 63, 82, 125, 26, 111, 191, 24, 210, 39, 161, 131, 239, + 64, 46, 175, 140, 39, 77, 202, 230, 115, 84, 40, 235, 62, 120, 148, 45, 57, 37, 124, + 121, 120, 249, 148, 231, 185, 172, 186, 224, 77, 61, 207, 141, 107, 126, 26, 147, 204, 229, + 121, 63, 58, 161, 43, 120, 25, 191, 165, 83, 228, 34, 205, 92, 27, 97, 67, 213, 13, + 253, 182, 91, 59, 133, 233, 166, 4, 4, 57, 209, 233, 179, 16, 35, 85, 59, 155, 111, + 250, 65, 194, 223, 99, 144, 59, 127, 241, 127, 85, 255, 125, 11, 90, 184, 145, 68, 95, + 150, 72, 153, 103, 49, 76, 120, 85, 161, 179, 241, 16, 174, 51, 211, 142, 150, 99, 201, + 22, 85, 73, 108, 84, 199, 120, 175, 128, 9, 243, 223, 160, 59, 120, 8, 109, 197, 128, + 194, 103, 52, 180, 119, 227, 231, 75, 113, 126, 175, 59, 148, 4, 132, 1, 89, 75, 121, + 8, 204, 131, 251, 171, 36, 55, 36, 44, 165, 233, 172, 103, 80, 224, 28, 200, 195, 3, + 20, 53, 129, 195, 112, 22, 200, 244, 23, 34, 64, 145, 42, 12, 20, 38, 184, 56, 94, + 220, 101, 3, 198, 17, 107, 22, 242, 135, 222, 182, 138, 243, 235, 11, 182, 91, 34, 127, + 80, 58, 161, 145, 203, 204, 158, 224, 242, 86, 24, 81, 51, 126, 84, 249, 143, 191, 15, + 130, 70, 238, 57, 209, 225, 36, 221, 152, 128, 255, 24, 208, 57, 186, 97, 4, 134, 255, + 229, 121, 86, 254, 202, 137, 124, 31, 130, 12, 222, 146, 142, 37, 129, 199, 247, 98, 236, + 212, 251, 108, 211, 20, 60, 13, 206, 158, 18, 84}; + +SimpleConvolution::SimpleConvolution() { + width_ = 64; + height_ = 64; + mask_width_ = 3; + mask_height_ = mask_width_; + randomize_seed_ = 0; + + if (!IsPowerOf2(width_)) { + width_ = RoundToPowerOf2(width_); + } + + if (!IsPowerOf2(height_)) { + height_ = RoundToPowerOf2(height_); + } + + if (!(mask_width_ % 2)) { + mask_width_++; + } + + if (!(mask_height_ % 2)) { + mask_height_++; + } + + if (width_ * height_ < 256) { + width_ = 64; + height_ = 64; + } + + const uint32_t input_size_bytes = width_ * height_ * sizeof(uint32_t); + 
const uint32_t mask_size_bytes = mask_width_ * mask_height_ * sizeof(float); + + SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, sizeof(kernel_args_t)); + SetInDescr(INPUT_BUF_ID, SYS_DES_ID, input_size_bytes); + SetInDescr(MASK_BUF_ID, SYS_DES_ID, mask_size_bytes); + SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, input_size_bytes); + SetHostDescr(REFOUT_BUF_ID, REFOUT_DES_ID, input_size_bytes); + + if (!randomize_seed_) TEST_ASSERT(sizeof(input_data_) <= input_size_bytes); +} + +void SimpleConvolution::Init() { + std::clog << "SimpleConvolution::init :" << std::endl; + + mem_descr_t kernarg_des = GetDescr(KERNARG_BUF_ID); + mem_descr_t input_des = GetDescr(INPUT_BUF_ID); + mem_descr_t mask_des = GetDescr(MASK_BUF_ID); + mem_descr_t output_des = GetDescr(LOCAL_BUF_ID); +#if 0 + printf("kernarg_des %p 0x%x\n", kernarg_des.ptr, kernarg_des.size); + printf("input_des %p 0x%x\n", input_des.ptr, input_des.size); + printf("mask_des %p 0x%x\n", mask_des.ptr, mask_des.size); + printf("output_des %p 0x%x\n", output_des.ptr, output_des.size); +#endif + uint32_t* input = reinterpret_cast(input_des.ptr); + uint32_t* output_local = reinterpret_cast(output_des.ptr); + float* mask = reinterpret_cast(mask_des.ptr); + kernel_args_t* kernel_args = reinterpret_cast(kernarg_des.ptr); + + if (randomize_seed_) { + // random initialisation of input + FillRandom(input, width_, height_, 0, 255, randomize_seed_); + } else { + // initialization with preset values + memcpy(input, input_data_, width_ * height_ * sizeof(uint32_t)); + } + + // Fill a blurr filter or some other filter of your choice + const float val = 1.0f / (mask_width_ * 2.0f - 1.0f); + for (uint32_t i = 0; i < (mask_width_ * mask_height_); i++) { + mask[i] = 0; + } + for (uint32_t i = 0; i < mask_width_; i++) { + uint32_t y = mask_height_ / 2; + mask[y * mask_width_ + i] = val; + } + for (uint32_t i = 0; i < mask_height_; i++) { + uint32_t x = mask_width_ / 2; + mask[i * mask_width_ + x] = val; + } + + // Print the INPUT array. 
+ std::clog << std::dec; + PrintArray("> Input[0]", input, width_, 1); + PrintArray("> Mask", mask, mask_width_, mask_height_); + + // Fill the kernel args + kernel_args->arg1 = output_local; + kernel_args->arg2 = input; + kernel_args->arg3 = mask; + kernel_args->arg4 = width_; + kernel_args->arg41 = height_; + kernel_args->arg5 = mask_width_; + kernel_args->arg51 = mask_height_; + + // Calculate the reference output + ReferenceImplementation(reinterpret_cast(GetRefOut()), input, mask, width_, height_, + mask_width_, mask_height_); +} + +void SimpleConvolution::PrintOutput(const void* ptr) const { + PrintArray("> Output[0]", reinterpret_cast(ptr), width_, 1); +} + +bool SimpleConvolution::ReferenceImplementation(uint32_t* output, const uint32_t* input, + const float* mask, const uint32_t width, + const uint32_t height, const uint32_t mask_width, + const uint32_t mask_height) { + const uint32_t vstep = (mask_width - 1) / 2; + const uint32_t hstep = (mask_height - 1) / 2; + + // for each pixel in the input + for (uint32_t x = 0; x < width; x++) { + for (uint32_t y = 0; y < height; y++) { + // find the left, right, top and bottom indices such that + // the indices do not go beyond image boundaires + const uint32_t left = (x < vstep) ? 0 : (x - vstep); + const uint32_t right = ((x + vstep) >= width) ? width - 1 : (x + vstep); + const uint32_t top = (y < hstep) ? 0 : (y - hstep); + const uint32_t bottom = ((y + hstep) >= height) ? 
height - 1 : (y + hstep); + + // initializing wighted sum value + float sum_fx = 0; + for (uint32_t i = left; i <= right; ++i) { + for (uint32_t j = top; j <= bottom; ++j) { + // performing wighted sum within the mask boundaries + uint32_t mask_idx = (j - (y - hstep)) * mask_width + (i - (x - vstep)); + uint32_t index = j * width + i; + + // to round to the nearest integer + sum_fx += ((float)input[index] * mask[mask_idx]); + } + } + sum_fx += 0.5f; + output[y * width + x] = uint32_t(sum_fx); + } + } + + return true; +} diff --git a/test/simple_convolution/simple_convolution.h b/test/simple_convolution/simple_convolution.h new file mode 100644 index 00000000..550d1320 --- /dev/null +++ b/test/simple_convolution/simple_convolution.h @@ -0,0 +1,94 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ +#define TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ + +#include +#include + +#include "ctrl/test_kernel.h" + +// Class implements SimpleConvolution kernel parameters +class SimpleConvolution : public TestKernel { + public: + // Kernel buffers IDs + enum { INPUT_BUF_ID, LOCAL_BUF_ID, MASK_BUF_ID, KERNARG_BUF_ID, REFOUT_BUF_ID }; + + // Constructor + SimpleConvolution(); + + // Initialize method + void Init(); + + // Return compute grid size + uint32_t GetGridSize() const { return width_ * height_; } + + // Print output + void PrintOutput(const void* ptr) const; + + // Return name + std::string Name() const { return std::string("SimpleConvolution"); } + + private: + // Local kernel arguments declaration + struct kernel_args_t { + void* arg1; + void* arg2; + void* arg3; + uint32_t arg4; + uint32_t arg41; + uint32_t arg5; + uint32_t arg51; + }; + + // Reference CPU implementation of Simple Convolution + // @param output Output matrix after performing convolution + // @param input Input matrix on which convolution is to be performed + // @param mask mask matrix using which convolution was to be performed + // @param input_dimensions dimensions of the input matrix + // @param mask_dimensions dimensions of the mask matrix + // @return bool true on success and false on failure + bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask, + const uint32_t width, const uint32_t height, + const uint32_t maskWidth, const uint32_t maskHeight); + + // Width of the Input array + uint32_t width_; + + // Height of the Input array + uint32_t height_; + + // Mask dimensions + uint32_t mask_width_; + + // Mask dimensions + uint32_t mask_height_; + + // Randomize input data + unsigned randomize_seed_; + + // Input data + static const uint32_t input_data_[]; +}; + +#endif // 
TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml new file mode 100644 index 00000000..899ca85e --- /dev/null +++ b/test/tool/gfx_metrics.xml @@ -0,0 +1,69 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/tool/input.xml b/test/tool/input.xml new file mode 100644 index 00000000..f4ecd178 --- /dev/null +++ b/test/tool/input.xml @@ -0,0 +1,14 @@ +# Filter by dispatches range, GPU index and kernel names + + +# List of metrics + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml new file mode 100644 index 00000000..a346eee9 --- /dev/null +++ b/test/tool/metrics.xml @@ -0,0 +1,205 @@ +#include "gfx_metrics.xml" + + + # average for 16 instances + + + + # sum for 16 instances + + + + + + + + + # FETCH_SIZE, kilobytes + # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + # WRITE_SIZE, kilobytes + # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + + + + # average for 16 instances + + + + # sum for 16 instances + + + + + + + + + + + # FETCH_SIZE, kilobytes + # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + # WRITE_SIZE, kilobytes + # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + + + + # GPUBusy, percentage + # The percentage of time GPU was busy. + + + # Wavefronts Total wavefronts., + + + # VALUInsts The average number of vector ALU instructions executed per work-item (affected by flow control). + + + # SALUInsts The average number of scalar ALU instructions executed per work-item (affected by flow control). 
+ + + # VFetchInsts The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. + + + # SFetchInsts The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). + + + # VWriteInsts The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. + + + # FlatVMemInsts The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. + + + # LDSInsts The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. + + + # FlatLDSInsts The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). + + + # GDSInsts The average number of GDS read or GDS write instructions executed per work item (affected by flow control). + + + # VALUUtilization The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). + + + # VALUBusy The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + + + # SALUBusy The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + + + # FetchSize The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + + # WriteSize The total kilobytes written to the video memory. 
This is measured with all extra fetches and any cache or memory effects taken into account. + + + # L2CacheHit The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). + + + # MemUnitBusy The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). + + + # MemUnitStalled The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). + + + # WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). + + + # The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). + + + # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). + + + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp new file mode 100644 index 00000000..0eb79940 --- /dev/null +++ b/test/tool/tool.cpp @@ -0,0 +1,1048 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +/////////////////////////////////////////////////////////////////////////////// +// // +// Test tool used as ROC profiler library demo // +// // +/////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include +#include /* For SYS_xxx definitions */ +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "inc/rocprofiler.h" +#include "util/hsa_rsrc_factory.h" +#include "util/xml.h" + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) +#define KERNEL_NAME_LEN_MAX 128 + +// Disoatch callback data type +struct callbacks_data_t { + rocprofiler_feature_t* features; + unsigned feature_count; + std::vector* set; + unsigned group_index; + FILE* file_handle; + int filter_on; + std::vector* gpu_index; + std::vector* kernel_string; + std::vector* range; +}; + +// Context stored entry type +struct context_entry_t { + uint32_t valid; + uint32_t index; + hsa_agent_t agent; + rocprofiler_group_t group; + rocprofiler_feature_t* features; + unsigned feature_count; + rocprofiler_callback_data_t data; + FILE* file_handle; +}; + +// +const std::string rcfile_name = "rpl_rc.xml"; +// verbose mode +static uint32_t verbose = 0; +// Enable tracing +static const bool trace_on = false; +// Tool is unloaded +volatile bool is_loaded = false; +// Dispatch callbacks and context handlers synchronization +pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; +// Dispatch callback data +callbacks_data_t* callbacks_data = NULL; +// Stored contexts array +typedef std::map context_array_t; +context_array_t* context_array = NULL; +typedef std::list wait_list_t; +wait_list_t* wait_list = NULL; +// Contexts collected count +volatile uint32_t context_count = 0; +volatile 
uint32_t context_collected = 0; +// Profiling results output file name +const char* result_prefix = NULL; +// Global results file handle +FILE* result_file_handle = NULL; +// True if a result file is opened +bool result_file_opened = false; +// Dispatch filters +// Metrics set +std::vector* metrics_set = NULL; +// GPU index filter +std::vector* gpu_index_vec = NULL; +// Kernel name filter +std::vector* kernel_string_vec = NULL; +// DIspatch number range filter +std::vector* range_vec = NULL; +// Otstanding dispatches parameters +static uint32_t CTX_OUTSTANDING_MAX = 0; +static uint32_t CTX_OUTSTANDING_MON = 0; +// to truncate kernel names +uint32_t to_truncate_names = 0; +// local SQTT buffer +bool is_sqtt_local = true; + +static inline uint32_t GetPid() { return syscall(__NR_getpid); } +static inline uint32_t GetTid() { return syscall(__NR_gettid); } + +// Error handler +void fatal(const std::string msg) { + fflush(stdout); + fprintf(stderr, "%s\n\n", msg.c_str()); + fflush(stderr); + abort(); +} + +// Check returned HSA API status +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +std::string filtr_kernel_name(const std::string name) { + auto rit = name.rbegin(); + auto rend = name.rend(); + uint32_t counter = 0; + char open_token = 0; + char close_token = 0; + while (rit != rend) { + if (counter == 0) { + switch (*rit) { + case ')': + counter = 1; + open_token = ')'; + close_token = '('; + break; + case '>': + counter = 1; + open_token = '>'; + close_token = '<'; + break; + } + if (counter == 0) break; + } else { + if (*rit == open_token) counter++; + if (*rit == close_token) counter--; + } + ++rit; + } + while (((*rit == ' ') || (*rit == ' ')) && (rit != rend)) rit++; + auto rbeg = rit; + while ((*rit != ' ') && (*rit != ':') && (rit != rend)) rit++; + const uint32_t pos = rend - rit; + 
const uint32_t length = rit - rbeg; + return name.substr(pos, length); +} + +void* monitor_thr_fun(void*) { + while (context_array != NULL) { + sleep(CTX_OUTSTANDING_MON); + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + const uint32_t inflight = context_count - context_collected; + std::cerr << std::flush; + std::clog << std::flush; + std::cout << "ROCProfiler: count(" << context_count << "), outstanding(" << inflight << "/" << CTX_OUTSTANDING_MAX << ")" << std::endl << std::flush; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + } + return NULL; +} + +uint32_t next_context_count() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + ++context_count; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + return context_count; +} + +// Allocate entry to store profiling context +context_entry_t* alloc_context_entry() { + if (CTX_OUTSTANDING_MAX != 0) { + while((context_count - context_collected) > CTX_OUTSTANDING_MAX) usleep(1000); + } + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + const uint32_t index = next_context_count() - 1; + auto ret = context_array->insert({index, context_entry_t{}}); + if (ret.second == false) { + fprintf(stderr, "context_array corruption, index repeated %u\n", index); + abort(); + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + context_entry_t* entry = &(ret.first->second); + entry->index = index; + return entry; +} + +// Allocate entry to store profiling context +void dealloc_context_entry(context_entry_t* entry) { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + assert(context_array != NULL); + context_array->erase(entry->index); + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } +} + +// Dump 
trace data to file +void dump_sqtt_trace(const char* label, const uint32_t chunk, const void* data, const uint32_t& size) { + if (result_prefix != NULL) { + // Open SQTT file + std::ostringstream oss; + oss << result_prefix << "/thread_trace_" << label << "_se" << chunk << ".out"; + FILE* file = fopen(oss.str().c_str(), "w"); + if (file == NULL) { + std::ostringstream errmsg; + errmsg << "fopen error, file '" << oss.str().c_str() << "'"; + perror(errmsg.str().c_str()); + abort(); + } + + // Write the buffer in terms of shorts (16 bits) + const unsigned short* ptr = reinterpret_cast(data); + for (uint32_t i = 0; i < (size / sizeof(short)); ++i) { + fprintf(file, "%04x\n", ptr[i]); + } + + // Close SQTT file + fclose(file); + } +} + +struct trace_data_arg_t { + FILE* file; + const char* label; + hsa_agent_t agent; +}; + +// Trace data callback for getting trace data from GPU local mamory +hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { + hsa_status_t status = HSA_STATUS_SUCCESS; + trace_data_arg_t* arg = reinterpret_cast(data); + if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { + const void* data_ptr = info_data->sqtt_data.ptr; + const uint32_t data_size = info_data->sqtt_data.size; + fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); + + if (is_sqtt_local) { + HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); + const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); + const uint32_t mem_size = data_size; + void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); + if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { + fatal("SQTT data memcopy to host failed"); + } + dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); + HsaRsrcFactory::FreeMemory(buffer); + } else { + dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); + } + } else + status = HSA_STATUS_ERROR; + return 
status; +} + +// Align to specified alignment +unsigned align_size(unsigned size, unsigned alignment) { + return ((size + alignment - 1) & ~(alignment - 1)); +} + +// Output profiling results for input features +void output_results(const context_entry_t* entry, const char* label) { + FILE* file = entry->file_handle; + const rocprofiler_feature_t* features = entry->features; + const unsigned feature_count = entry->feature_count; + rocprofiler_t* context = entry->group.context; + + for (unsigned i = 0; i < feature_count; ++i) { + const rocprofiler_feature_t* p = &features[i]; + fprintf(file, " %s ", p->name); + switch (p->data.kind) { + // Output metrics results + case ROCPROFILER_DATA_KIND_INT64: + fprintf(file, "(%lu)\n", p->data.result_int64); + break; + // Output trace results + case ROCPROFILER_DATA_KIND_BYTES: { + if (p->data.result_bytes.copy) { + uint64_t size = 0; + + const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); + const char* end = reinterpret_cast(ptr + p->data.result_bytes.size); + for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { + const uint32_t chunk_size = *reinterpret_cast(ptr); + const char* chunk_data = ptr + sizeof(uint32_t); + if (chunk_data >= end) fatal("SQTT data is out of the result buffer size"); + + dump_sqtt_trace(label, i, chunk_data, chunk_size); + const uint32_t off = align_size(chunk_size, sizeof(uint32_t)); + ptr = chunk_data + off; + if (chunk_data >= end) fatal("SQTT data ptr is out of the result buffer size"); + size += chunk_size; + } + fprintf(file, "size(%lu)\n", size); + HsaRsrcFactory::FreeMemory(p->data.result_bytes.ptr); + const_cast(p)->data.result_bytes.size = 0; + } else { + fprintf(file, "(\n"); + trace_data_arg_t trace_data_arg{file, label, entry->agent}; + hsa_status_t status = rocprofiler_iterate_trace_data(context, trace_data_cb, reinterpret_cast(&trace_data_arg)); + check_status(status); + fprintf(file, " )\n"); + } + break; + } + default: + fprintf(stderr, "RPL-tool: undefined 
data kind(%u)\n", p->data.kind); + abort(); + } + } +} + +// Output group intermeadate profiling results, created internally for complex metrics +void output_group(const context_entry_t* entry, const char* label) { + const rocprofiler_group_t* group = &(entry->group); + context_entry_t group_entry = *entry; + for (unsigned i = 0; i < group->feature_count; ++i) { + if (group->features[i]->data.kind == ROCPROFILER_DATA_KIND_INT64) { + group_entry.features = group->features[i]; + group_entry.feature_count = 1; + output_results(&group_entry, label); + } + } +} + +// Dump stored context profiling output data +bool dump_context(context_entry_t* entry) { + hsa_status_t status = HSA_STATUS_ERROR; + + if (entry->valid == 0) return true; + + const rocprofiler_dispatch_record_t* record = entry->data.record; + if (record) { + if (record->complete == 0) { + return false; + } + } + + ++context_collected; + + const uint32_t index = entry->index; + FILE* file_handle = entry->file_handle; + const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); + + fprintf(file_handle, "dispatch[%u], queue_index(%lu), kernel_name(\"%s\")", + index, + entry->data.queue_index, + nik_name.c_str()); + if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", + record->dispatch, + record->begin, + record->end, + record->complete); + fprintf(file_handle, "\n"); + fflush(file_handle); + + if (record) { + delete record; + entry->data.record = NULL; + } + + rocprofiler_group_t& group = entry->group; + if (group.context != NULL) { + status = rocprofiler_group_get_data(&group); + check_status(status); + if (verbose == 1) output_group(entry, "group0-data"); + + status = rocprofiler_get_metrics(group.context); + check_status(status); + std::ostringstream oss; + oss << index << "__" << filtr_kernel_name(entry->data.kernel_name); + output_results(entry, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str()); + free(const_cast(entry->data.kernel_name)); + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + rocprofiler_close(group.context); + } + + entry->valid = 0; + return true; +} + +// Dump and clean a given context entry +static inline bool dump_context_entry(context_entry_t* entry) { + const bool ret = dump_context(entry); + if (ret) dealloc_context_entry(entry); + return ret; +} + +// Dump waiting entries +static inline void dump_wait_list() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + auto it = wait_list->begin(); + auto end = wait_list->end(); + while (it != end) { + auto cur = it++; + if (dump_context_entry(*cur)) { + wait_list->erase(cur); + } + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } +} + +// Dump all stored contexts profiling output data +void dump_context_array() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + if (context_array) { + if (!wait_list->empty()) 
dump_wait_list(); + + auto it = context_array->begin(); + auto end = context_array->end(); + while (it != end) { + auto cur = it++; + dump_context(&(cur->second)); + } + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } +} + +// Profiling completion handler +bool handler(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + if (!wait_list->empty()) dump_wait_list(); + + if (!dump_context_entry(entry)) { + wait_list->push_back(entry); + } + + if (trace_on) { + fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); + fflush(stdout); + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + +bool check_filter(const rocprofiler_callback_data_t* callback_data, const callbacks_data_t* tool_data) { + bool found = true; + + std::vector* range_ptr = tool_data->range; + if (found && range_ptr) { + found = false; + std::vector& range = *range_ptr; + if (range.size() == 1) { + if (context_count >= range[0]) found = true; + } else if (range.size() == 2) { + if ((context_count >= range[0]) && (context_count < range[1])) found = true; + } + } + std::vector* gpu_index = tool_data->gpu_index; + if (found && gpu_index) { + found = false; + for (uint32_t i : *gpu_index) { + if (i == callback_data->agent_index) { + found = true; + } + } + } + std::vector* kernel_string = tool_data->kernel_string; + if (found && kernel_string) { + found = false; + for (const std::string& s : *kernel_string) { + if (std::string(callback_data->kernel_name).find(s) != std::string::npos) { + found = true; + } + } + } + + return found; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + // Passed tool data + 
callbacks_data_t* tool_data = reinterpret_cast(user_data); + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + // Checking dispatch condition + if (tool_data->filter_on == 1) { + if (check_filter(callback_data, tool_data) == false) { + next_context_count(); + return HSA_STATUS_SUCCESS; + } + } + // Profiling context + rocprofiler_t* context = NULL; + // Context entry + context_entry_t* entry = alloc_context_entry(); + // context properties + rocprofiler_properties_t properties{}; + properties.handler = (result_prefix != NULL) ? handler : NULL; + properties.handler_arg = (void*)entry; + + rocprofiler_feature_t* features = tool_data->features; + unsigned feature_count = tool_data->feature_count; + + if (tool_data->set != NULL) { + uint32_t set_offset = 0; + uint32_t next_offset = 0; + const auto entry_index = entry->index; + if (entry_index < (tool_data->set->size() - 1)) { + set_offset = (*(tool_data->set))[entry_index]; + next_offset = (*(tool_data->set))[entry_index + 1]; + } else { + set_offset = tool_data->set->back(); + next_offset = feature_count; + } + features += set_offset; + feature_count = next_offset - set_offset; + } + + if (tool_data->feature_count > 0) { + // Open profiling context + status = rocprofiler_open(callback_data->agent, features, feature_count, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Check that we have only one profiling group + uint32_t group_count = 0; + status = rocprofiler_group_count(context, &group_count); + check_status(status); + assert(group_count == 1); + // Get group[0] + const uint32_t group_index = 0; + status = rocprofiler_get_group(context, group_index, group); + check_status(status); + } + + // Fill profiling context entry + entry->agent = callback_data->agent; + entry->group = *group; + entry->features = features; + entry->feature_count = feature_count; + entry->data = *callback_data; + entry->data.kernel_name = strdup(callback_data->kernel_name); + 
entry->file_handle = tool_data->file_handle; + entry->valid = 1; + + if (trace_on) { + fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); + fflush(stdout); + } + + return status; +} + +hsa_status_t destroy_callback(hsa_queue_t* queue, void*) { + if (result_file_opened == false) printf("\nROCProfiler results:\n"); + dump_context_array(); + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg) { + const char symb = *reinterpret_cast(arg); + if (((symb == 'b') && (info.metric.expr == NULL)) || + ((symb == 'd') && (info.metric.expr != NULL))) + { + printf("\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); + if (info.metric.expr != NULL) printf(" %s = %s\n", info.metric.name, info.metric.expr); + } + return HSA_STATUS_SUCCESS; +} + +std::string normalize_token(const std::string token, bool not_empty, std::string label) { + const std::string space_chars_set = " \t"; + const size_t first_pos = token.find_first_not_of(space_chars_set); + size_t norm_len = 0; + std::string error_str = "none"; + if (first_pos != std::string::npos) { + const size_t last_pos = token.find_last_not_of(space_chars_set); + if (last_pos == std::string::npos) error_str = "token string error: \"" + token + "\""; + else { + const size_t end_pos = last_pos + 1; + if (end_pos <= first_pos) error_str = "token string error: \"" + token + "\""; + else norm_len = end_pos - first_pos; + } + } + if (((first_pos != std::string::npos) && (norm_len == 0)) || + ((first_pos == std::string::npos) && not_empty)) { + fatal(label + ": " + error_str); + } + return (norm_len != 0) ? 
token.substr(first_pos, norm_len) : std::string(""); +} + +int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { + int parse_iter = 0; + auto nodes = xml->GetNodes(tag); + auto rit = nodes.rbegin(); + auto rend = nodes.rend(); + while (rit != rend) { + auto& opts = (*rit)->opts; + if (opts.find(field) != opts.end()) break; + ++rit; + } + if (rit != rend) { + const std::string array_string = (*rit)->opts[field]; + if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str()); + size_t pos1 = 0; + const size_t string_len = array_string.length(); + while (pos1 < string_len) { + const size_t pos2 = array_string.find(delim, pos1); + const bool found = (pos2 != std::string::npos); + const size_t token_len = (pos2 != std::string::npos) ? pos2 - pos1 : string_len - pos1; + const std::string token = array_string.substr(pos1, token_len); + const std::string norm_str = normalize_token(token, found, "Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field); + if (norm_str.length() != 0) vec->push_back(norm_str); + if (!found) break; + pos1 = pos2 + 1; + ++parse_iter; + } + } + + return parse_iter; +} + +int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { + std::vector str_vec; + const int parse_iter = get_xml_array(xml, tag, field, delim, &str_vec, label); + for (const std::string& str : str_vec) vec->push_back(atoi(str.c_str())); + return parse_iter; +} + +static inline void check_env_var(const char* var_name, uint32_t& val) { + const char* str = getenv(var_name); + if (str != NULL ) val = atol(str); +} +static inline void check_env_var(const char* var_name, uint64_t& val) { + const char* str = getenv(var_name); + if (str != NULL ) val = atoll(str); +} + +// Tool constructor +extern "C" PUBLIC_API void 
OnLoadToolProp(rocprofiler_settings_t* settings) +{ + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + if (is_loaded) return; + is_loaded = true; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + // Loading configuration rcfile + std::string rcpath = std::string("./") + rcfile_name; + xml::Xml* rcfile = xml::Xml::Create(rcpath); + const char* home_dir = getenv("HOME"); + if (rcfile == NULL && home_dir != NULL) { + rcpath = std::string(home_dir) + "/" + rcfile_name; + rcfile = xml::Xml::Create(rcpath); + } + const char* pkg_dir = getenv("ROCP_PACKAGE_DIR"); + if (rcfile == NULL && pkg_dir != NULL) { + rcpath = std::string(pkg_dir) + "/" + rcfile_name; + rcfile = xml::Xml::Create(rcpath); + } + if (rcfile != NULL) { + // Getting defaults + printf("ROCProfiler: rc-file '%s'\n", rcpath.c_str()); + auto defaults_list = rcfile->GetNodes("top.defaults"); + for (auto* entry : defaults_list) { + const auto& opts = entry->opts; + auto it = opts.find("basenames"); + if (it != opts.end()) { to_truncate_names = (it->second == "on") ? 1 : 0; } + it = opts.find("timestamp"); + if (it != opts.end()) { settings->timestamp_on = (it->second == "on") ? 
1 : 0; } + it = opts.find("ctx-limit"); + if (it != opts.end()) { CTX_OUTSTANDING_MAX = atol(it->second.c_str()); } + it = opts.find("heartbeat"); + if (it != opts.end()) { CTX_OUTSTANDING_MON = atol(it->second.c_str()); } + it = opts.find("sqtt-size"); + if (it != opts.end()) { + std::string str = normalize_token(it->second, true, "option sqtt-size"); + uint32_t multiplier = 1; + switch (str.back()) { + case 'K': multiplier = 1024; break; + case 'M': multiplier = 1024 * 1024; break; + } + if (multiplier != 1) str = str.substr(0, str.length() - 1); + settings->sqtt_size = strtoull(str.c_str(), NULL, 0) * multiplier; + } + it = opts.find("sqtt-local"); + if (it != opts.end()) { settings->sqtt_local = (it->second == "on"); } + } + } + // Enable verbose mode + check_env_var("ROCP_VERBOSE_MODE", verbose); + // Enable kernel names truncating + check_env_var("ROCP_TRUNCATE_NAMES", to_truncate_names); + // Set outstanding dispatches parameter + check_env_var("ROCP_OUTSTANDING_MAX", CTX_OUTSTANDING_MAX); + check_env_var("ROCP_OUTSTANDING_MON", CTX_OUTSTANDING_MON); + // Enable timestamping + check_env_var("ROCP_TIMESTAMP_ON", settings->timestamp_on); + // Set data timeout + check_env_var("ROCP_DATA_TIMEOUT", settings->timeout); + // Set SQTT size + check_env_var("ROCP_SQTT_SIZE", settings->sqtt_size); + // Set SQTT local buffer + check_env_var("ROCP_SQTT_LOCAL", settings->sqtt_local); + + is_sqtt_local = settings->sqtt_local; + + // Printing out info + char* info_symb = getenv("ROCP_INFO"); + if (info_symb != NULL) { + if (*info_symb != 'b' && *info_symb != 'd') { + fprintf(stderr, "ROCProfiler: bad info symbol '%c', ROCP_INFO env", *info_symb); + } else { + if (*info_symb == 'b') printf("Basic HW counters:\n"); + else printf("Derived metrics:\n"); + rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb); + } + exit(1); + } + + // Set output file + result_prefix = getenv("ROCP_OUTPUT_DIR"); + if (result_prefix != NULL) { + DIR* dir = 
opendir(result_prefix); + if (dir == NULL) { + std::ostringstream errmsg; + errmsg << "ROCProfiler: Cannot open output directory '" << result_prefix << "'"; + perror(errmsg.str().c_str()); + abort(); + } + std::ostringstream oss; + oss << result_prefix << "/results.txt"; + result_file_handle = fopen(oss.str().c_str(), "w"); + if (result_file_handle == NULL) { + std::ostringstream errmsg; + errmsg << "ROCProfiler: fopen error, file '" << oss.str().c_str() << "'"; + perror(errmsg.str().c_str()); + abort(); + } + } else result_file_handle = stdout; + + result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL); + + // Getting input + const char* xml_name = getenv("ROCP_INPUT"); + if (xml_name == NULL) fatal("ROCProfiler: input is not specified, ROCP_INPUT env"); + printf("ROCProfiler: input from \"%s\"\n", xml_name); + xml::Xml* xml = xml::Xml::Create(xml_name); + if (xml == NULL) { + fprintf(stderr, "ROCProfiler: Input file not found '%s'\n", xml_name); + abort(); + } + + // Getting metrics + std::vector metrics_vec; + get_xml_array(xml, "top.metric", "name", ",", &metrics_vec); + + // Metrics set + metrics_set = new std::vector; + get_xml_array(xml, "top.metric", "set", ",", metrics_set, " "); + if (metrics_set->size() != 0) { + uint32_t accum = 0; + metrics_set->insert(metrics_set->begin(), 0); + for (auto it = metrics_set->begin(); it != metrics_set->end(); ++it) { + accum += *it; + *it = accum; + } + } + + // Getting GPU indexes + gpu_index_vec = new std::vector; + get_xml_array(xml, "top.metric", "gpu_index", ",", gpu_index_vec, " "); + + // Getting kernel names + kernel_string_vec = new std::vector; + get_xml_array(xml, "top.metric", "kernel", ",", kernel_string_vec, " "); + + // Getting profiling range + range_vec = new std::vector; + const int range_parse_iter = get_xml_array(xml, "top.metric", "range", ":", range_vec, " "); + if ((range_vec->size() > 2) || (range_parse_iter > 1)) + { + fatal("Bad range format, input file " + 
xml->GetName()); + } + if ((range_vec->size() == 1) && (range_parse_iter == 0)) { + range_vec->push_back(*(range_vec->begin()) + 1); + } + + // Getting traces + auto traces_list = xml->GetNodes("top.trace"); + + const unsigned feature_count = metrics_vec.size() + traces_list.size(); + rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; + memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); + + printf(" %d metrics\n", (int)metrics_vec.size()); + for (unsigned i = 0; i < metrics_vec.size(); ++i) { + const std::string& name = metrics_vec[i]; + printf("%s%s", (i == 0) ? " " : ", ", name.c_str()); + features[i] = {}; + features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[i].name = strdup(name.c_str()); + } + if (metrics_vec.size()) printf("\n"); + + printf(" %d traces\n", (int)traces_list.size()); + unsigned index = metrics_vec.size(); + for (auto* entry : traces_list) { + auto params_list = xml->GetNodes("top.trace.parameters"); + if (params_list.size() > 1) { + fatal("ROCProfiler: Single input 'parameters' section is supported"); + } + std::string name = ""; + bool to_copy_data = false; + for (const auto& opt : entry->opts) { + if (opt.first == "name") name = opt.second; + else if (opt.first == "copy") to_copy_data = (opt.second == "true"); + else fatal("ROCProfiler: Bad trace property '" + opt.first + "'"); + } + if (name == "") fatal("ROCProfiler: Bad trace properties, name is not specified"); + + std::map parameters_dict; + parameters_dict["TARGET_CU"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; + parameters_dict["VM_ID_MASK"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK; + parameters_dict["MASK"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; + parameters_dict["TOKEN_MASK"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; + parameters_dict["TOKEN_MASK2"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; + parameters_dict["SE_MASK"] = + 
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK; + + printf(" %s (", name.c_str()); + features[index] = {}; + features[index].kind = ROCPROFILER_FEATURE_KIND_TRACE; + features[index].name = strdup(name.c_str()); + features[index].data.result_bytes.copy = to_copy_data; + + for (auto* params : params_list) { + const unsigned parameter_count = params->opts.size(); + rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; + unsigned p_index = 0; + for (auto& v : params->opts) { + const std::string parameter_name = v.first; + if (parameters_dict.find(parameter_name) == parameters_dict.end()) { + fprintf(stderr, "ROCProfiler: unknown trace parameter '%s'\n", parameter_name.c_str()); + abort(); + } + const uint32_t value = strtol(v.second.c_str(), NULL, 0); + printf("\n %s = 0x%x", parameter_name.c_str(), value); + parameters[p_index] = {}; + parameters[p_index].parameter_name = parameters_dict[parameter_name]; + parameters[p_index].value = value; + ++p_index; + } + + features[index].parameters = parameters; + features[index].parameter_count = parameter_count; + } + if (params_list.empty() == false) printf("\n "); + printf(")\n"); + fflush(stdout); + ++index; + } + fflush(stdout); + + // Context array aloocation + context_array = new context_array_t; + wait_list = new wait_list_t; + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{0}; + callbacks_ptrs.dispatch = dispatch_callback; + callbacks_ptrs.destroy = destroy_callback; + + callbacks_data = new callbacks_data_t{}; + callbacks_data->features = features; + callbacks_data->feature_count = feature_count; + callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; + callbacks_data->group_index = 0; + callbacks_data->file_handle = result_file_handle; + callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; + callbacks_data->kernel_string = (kernel_string_vec->empty()) ? 
NULL : kernel_string_vec; + callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; + callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || + (callbacks_data->kernel_string != NULL) || + (callbacks_data->range != NULL) + ? 1 : 0; + + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); + + xml::Xml::Destroy(xml); + + if (CTX_OUTSTANDING_MON != 0) { + pthread_t thread; + pthread_attr_t attr; + int err = pthread_attr_init(&attr); + if (err) { errno = err; perror("pthread_attr_init"); abort(); } + err = pthread_create(&thread, &attr, monitor_thr_fun, NULL); + } +} + +// Tool destructor +extern "C" PUBLIC_API void OnUnloadTool() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + if (!is_loaded) return; + is_loaded = false; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + // Unregister dispatch callback + rocprofiler_remove_queue_callbacks(); + + // Dump stored profiling output data + printf("\nROCPRofiler: %u contexts collected", context_collected); + if (result_file_opened) printf(", output directory %s", result_prefix); + printf("\n"); fflush(stdout); + dump_context_array(); + if (wait_list) { + if (!wait_list->empty()) { + printf("\nWaiting for pending kernels ..."); fflush(stdout); + while (wait_list->size() != 0) { + usleep(1000); + dump_wait_list(); + } + printf(".done\n"); fflush(stdout); + } + } + if (result_file_opened) fclose(result_file_handle); + + // Cleanup + if (callbacks_data != NULL) { + delete[] callbacks_data->features; + delete callbacks_data; + callbacks_data = NULL; + } + delete metrics_set; + metrics_set = NULL; + delete gpu_index_vec; + gpu_index_vec = NULL; + delete kernel_string_vec; + kernel_string_vec = NULL; + delete range_vec; + range_vec = NULL; + delete context_array; + context_array = NULL; + delete wait_list; + wait_list = NULL; +} + +extern "C" DESTRUCTOR_API void destructor() { + if (is_loaded == true) 
OnUnloadTool(); +} diff --git a/test/util/helper_funcs.h b/test/util/helper_funcs.h new file mode 100644 index 00000000..c76854ba --- /dev/null +++ b/test/util/helper_funcs.h @@ -0,0 +1,86 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
/*******************************************************************************/

#ifndef TEST_UTIL_HELPER_FUNCS_H_
#define TEST_UTIL_HELPER_FUNCS_H_

// NOTE(review): the header names and the template parameter lists of this
// file were lost when the text was extracted; they are reconstructed from
// how the code uses them - confirm against the real file.
#include <limits.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include <string>

// Prints an error message to stderr
static inline void Error(std::string error_msg) {
  std::cerr << "Error: " << error_msg << std::endl;
}

// Prints a row-major 'width' x 'height' array to std::clog, one row per
// line, preceded by 'header'
template <typename T>
void PrintArray(const std::string header, const T* data, const int width, const int height) {
  std::clog << header << " :\n";
  for (int i = 0; i < height; i++) {
    std::clog << "> ";
    for (int j = 0; j < width; j++) {
      std::clog << data[i * width + j] << " ";
    }
    std::clog << "\n";
  }
}

// Fills a 'width' x 'height' array with pseudo-random values in
// [range_min, range_max]. A zero 'seed' selects a time-based seed.
// Returns false (after printing an error) if 'array_ptr' is NULL.
template <typename T>
bool FillRandom(T* array_ptr, const int width, const int height, const T range_min,
                const T range_max, unsigned int seed = 123) {
  if (!array_ptr) {
    Error("Cannot fill array. NULL pointer.");
    return false;
  }

  if (!seed) seed = (unsigned int)time(NULL);

  srand(seed);
  double range = double(range_max - range_min) + 1.0;

  /* random initialisation of input */
  for (int i = 0; i < height; i++)
    for (int j = 0; j < width; j++) {
      int index = i * width + j;
      // rand() / (RAND_MAX + 1.0) is in [0, 1), so the offset is in [0, range)
      array_ptr[index] = range_min + T(range * rand() / (RAND_MAX + 1.0));
    }

  return true;
}

// Rounds 'val' up to the nearest power of two; a power of two is returned
// unchanged.
template <typename T> T RoundToPowerOf2(T val) {
  const unsigned bits = sizeof(T) * CHAR_BIT;

  val--;
  // Smear the highest set bit into every lower position. The shifts have to
  // run 1, 2, 4, ... up to half the width of T: the original loop ran
  // sizeof(T) times, which stopped at 8 for 32-bit types (wrong results above
  // 65536) and shifted by 64 and 128 for 64-bit types (undefined behaviour).
  for (unsigned shift = 1; shift < bits; shift <<= 1) val |= val >> shift;
  val++;

  return val;
}

// Returns true if 'val' is a power of two; zero is not
template <typename T> bool IsPowerOf2(T val) {
  long long long_val = val;
  // (x & -x) isolates the lowest set bit; it equals x only if a single bit is set
  return (((long_val & (-long_val)) - long_val == 0) && (long_val != 0));
}

#endif  // TEST_UTIL_HELPER_FUNCS_H_

// diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp
// new file mode 100644
// index 00000000..5116a3a8
// --- /dev/null
// +++ b/test/util/hsa_rsrc_factory.cpp
// @@ -0,0 +1,556 @@
// /******************************************************************************
// Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "util/hsa_rsrc_factory.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// Callback function to get available in the system agents +hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { + hsa_status_t status = HSA_STATUS_ERROR; + HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); + const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent); + if (agent_info != NULL) status = HSA_STATUS_SUCCESS; + return status; +} + +// This function checks to see if the provided +// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, +// the function adds an additional requirement that the pool have the +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. 
If kern_arg is false,
+// pools must NOT have this property.
+// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is
+// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but
+// no pool was found meeting the requirements. A NULL data pointer is reported
+// as HSA_STATUS_ERROR_INVALID_ARGUMENT; a failing pool query is handled by
+// CHECK_STATUS.
+// @param pool Memory pool currently visited by the iteration
+// @param data Points to the hsa_amd_memory_pool_t that receives the match
+// @param kern_arg Whether the pool must carry the kernarg flag
+static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
+  hsa_status_t err;
+  hsa_amd_segment_t segment;
+  uint32_t flag;
+
+  if (nullptr == data) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+  CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
+  if (HSA_AMD_SEGMENT_GLOBAL != segment) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
+  CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
+
+  // Keep iterating unless the kernarg property of the pool matches the request
+  const bool is_kern_arg = (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) != 0;
+  if (is_kern_arg != kern_arg) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  *(reinterpret_cast<hsa_amd_memory_pool_t*>(data)) = pool;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
+// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
+hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
+  return FindGlobalPool(pool, data, false);
+}
+
+// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
+// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
+hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
+  return FindGlobalPool(pool, data, true);
+}
+
+// Constructor of the class
+// @param initialize_hsa Whether this object calls hsa_init() here and
+// hsa_shut_down() in the destructor
+HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
+  hsa_status_t status;
+  // Initialize the Hsa Runtime
+  if (initialize_hsa_) {
+    status = hsa_init();
+    CHECK_STATUS("Error in hsa_init", status);
+  }
+  // Discover the set of Cpu and Gpu devices available on the platform
+  status = hsa_iterate_agents(GetHsaAgentsCallback, this);
+  CHECK_STATUS("Error Calling hsa_iterate_agents", status);
+
+  // Get AqlProfile API table
+  aqlprofile_api_ = {0};
+#ifdef ROCP_LD_AQLPROFILE
+  status = LoadAqlProfileLib(&aqlprofile_api_);
+#else
+  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_);
+#endif
+  CHECK_STATUS("aqlprofile API table load failed", status);
+
+  // Get Loader API table
+  loader_api_ = {0};
+  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_);
+  CHECK_STATUS("loader API table query failed", status);
+}
+
+// Destructor of the class
+// Releases the AgentInfo records and shuts the runtime down if the
+// constructor initialized it
+HsaRsrcFactory::~HsaRsrcFactory() {
+  for (auto p : cpu_list_) delete p;
+  for (auto p : gpu_list_) delete p;
+  if (initialize_hsa_) {
+    hsa_status_t status = hsa_shut_down();
+    CHECK_STATUS("Error in hsa_shut_down", status);
+  }
+}
+
+// Load the AqlProfile library with dlopen() and fill the API table by name
+// @param api Output table updated with the resolved entry points
+// @return hsa_status_t HSA_STATUS_ERROR if the library cannot be loaded,
+// HSA_STATUS_SUCCESS otherwise
+// NOTE(review): the dlsym() results are not checked, so a symbol missing from
+// the library leaves a NULL entry in the table.
+hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
+  void* handle = dlopen(kAqlProfileLib, RTLD_NOW);
+  if (handle == NULL) {
+    fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror());
+    return HSA_STATUS_ERROR;
+  }
+  dlerror(); /* Clear any existing error */
+
+  api->hsa_ven_amd_aqlprofile_error_string =
+      (decltype(::hsa_ven_amd_aqlprofile_error_string)*)dlsym(
+          handle, "hsa_ven_amd_aqlprofile_error_string");
+  api->hsa_ven_amd_aqlprofile_validate_event =
+      (decltype(::hsa_ven_amd_aqlprofile_validate_event)*)dlsym(
+          handle, "hsa_ven_amd_aqlprofile_validate_event");
+  api->hsa_ven_amd_aqlprofile_start =
+      (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start");
+  api->hsa_ven_amd_aqlprofile_stop =
+      (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop");
+  api->hsa_ven_amd_aqlprofile_read =
+      (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read");
+  api->hsa_ven_amd_aqlprofile_legacy_get_pm4 =
+      (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym(
+          handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
+  api->hsa_ven_amd_aqlprofile_get_info = (decltype(::hsa_ven_amd_aqlprofile_get_info)*)dlsym(
+      handle, "hsa_ven_amd_aqlprofile_get_info");
+  api->hsa_ven_amd_aqlprofile_iterate_data =
+      (decltype(::hsa_ven_amd_aqlprofile_iterate_data)*)dlsym(
+          handle, "hsa_ven_amd_aqlprofile_iterate_data");
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Add system agent info
+// Creates an AgentInfo record for a Cpu or Gpu agent and stores it in the
+// per-type list and in the agent map.
+// @param agent Agent to be registered
+// @return AgentInfo* The new record, NULL if the agent is neither Cpu nor Gpu
+const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
+  // Determine the type of the agent
+  hsa_status_t status;
+  AgentInfo* agent_info = NULL;
+
+  hsa_device_type_t type;
+  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
+  CHECK_STATUS("Error Calling hsa_agent_get_info", status);
+
+  if (type == HSA_DEVICE_TYPE_CPU) {
+    agent_info = new AgentInfo{};
+    agent_info->dev_id = agent;
+    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
+    agent_info->dev_index = cpu_list_.size();
+
+    // A Cpu agent has to provide both a standard and a kernarg global pool
+    status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
+    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status);
+    status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool);
+    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status);
+    agent_info->gpu_pool = {};
+
+    cpu_list_.push_back(agent_info);
+    cpu_agents_.push_back(agent);
+  }
+
+  if (type == HSA_DEVICE_TYPE_GPU) {
+    agent_info = new AgentInfo{};
+    agent_info->dev_id = agent;
+    agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
+    // The GFXIP name is the first four characters of the agent name
+    strncpy(agent_info->gfxip, agent_info->name, 4);
+    agent_info->gfxip[4] = '\0';
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
+    agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
+                       &agent_info->cu_num);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
+                       &agent_info->waves_per_cu);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
+                       &agent_info->simds_per_cu);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
+                       &agent_info->se_num);
+    hsa_agent_get_info(agent,
+                       static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
+                       &agent_info->shader_arrays_per_se);
+
+    agent_info->cpu_pool = {};
+    agent_info->kern_arg_pool = {};
+    status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
+    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status);
+
+    // Set GPU index
+    agent_info->dev_index = gpu_list_.size();
+    gpu_list_.push_back(agent_info);
+    gpu_agents_.push_back(agent);
+  }
+
+  if (agent_info) agent_map_[agent.handle] = agent_info;
+
+  return agent_info;
+}
+
+// Return system agent info
+// @param agent Agent to look up
+// @return AgentInfo* Record registered for the agent, NULL if there is none
+const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) {
+  auto it = agent_map_.find(agent.handle);
+  return (it != agent_map_.end()) ? it->second : NULL;
+}
+
+// Get the count of Hsa Gpu Agents available on the platform
+//
+// @return uint32_t Number of Gpu agents on platform
+//
+uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); }
+
+// Get the count of Hsa Cpu Agents available on the platform
+//
+// @return uint32_t Number of Cpu agents on platform
+//
+uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); }
+
+// Get the AgentInfo handle of a Gpu device
+//
+// @param idx Gpu Agent
at specified index
+//
+// @param agent_info Output parameter updated with AgentInfo
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
+  // The output is written only for an index inside the Gpu list
+  const bool in_range = (idx < uint32_t(gpu_list_.size()));
+  if (in_range) *agent_info = gpu_list_[idx];
+  return in_range;
+}
+
+// Look up the AgentInfo record of a Cpu device by its position in the Cpu list
+//
+// @param idx Position of the Cpu Agent
+//
+// @param agent_info Output parameter, written only when idx is valid
+//
+// @return bool true if idx refers to a known Cpu Agent, false otherwise
+//
+bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
+  // The output is written only for an index inside the Cpu list
+  const bool in_range = (idx < uint32_t(cpu_list_.size()));
+  if (in_range) *agent_info = cpu_list_[idx];
+  return in_range;
+}
+
+// Create a multi-producer Queue object on the given agent. The queue is sized
+// to hold the requested number of Aql packets.
+//
+// @param agent_info Gpu Agent on which to create a queue object
+//
+// @param num_pkts Number of packets to be held by queue
+//
+// @param queue Output parameter updated with handle of queue object
+//
+// @return bool true if the queue was created, false otherwise
+//
+bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
+                                 hsa_queue_t** queue) {
+  const hsa_status_t create_status =
+      hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
+                       UINT32_MAX, UINT32_MAX, queue);
+  return (create_status == HSA_STATUS_SUCCESS);
+}
+
+// Create a Signal object and return its handle.
+// @param value Initial value of signal object
+// @param signal Output parameter updated with handle of signal object
+// @return bool true if successful, false otherwise
+bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) {
+  hsa_status_t status;
+  status = hsa_signal_create(value, 0, NULL, signal);
+  return (status == HSA_STATUS_SUCCESS);
+}
+
+// Allocate memory for use by a kernel of specified size in specified
+// agent's memory region. The size is rounded up to a multiple of the page size.
+// @param agent_info Agent from whose memory region to allocate
+// @param size Size of memory in terms of bytes
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) {
+  hsa_status_t status = HSA_STATUS_ERROR;
+  uint8_t* buffer = NULL;
+  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
+  status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0,
+                                        reinterpret_cast<void**>(&buffer));
+  uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
+  return ptr;
+}
+
+// Allocate memory to pass kernel parameters.
+// Memory is allocated from the kernarg pool of the first CPU agent and made
+// accessible to the GPU given by AgentInfo parameter.
+// @param agent_info Agent that is granted access to the memory
+// @param size Size of memory in terms of bytes
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) {
+  hsa_status_t status = HSA_STATUS_ERROR;
+  uint8_t* buffer = NULL;
+  if (!cpu_agents_.empty()) {
+    size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
+    status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0,
+                                          reinterpret_cast<void**>(&buffer));
+    // Both the CPU and GPU can access the kernel arguments
+    if (status == HSA_STATUS_SUCCESS) {
+      hsa_agent_t ag_list[1] = {agent_info->dev_id};
+      status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer);
+    }
+  }
+  uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
+  return ptr;
+}
+
+// Allocate system memory accessible by both CPU and GPU
+// Memory is allocated from the standard pool of the first CPU agent and made
+// accessible to the GPU given by AgentInfo parameter.
+// @param agent_info Agent that is granted access to the memory
+// @param size Size of memory in terms of bytes
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) {
+  hsa_status_t status = HSA_STATUS_ERROR;
+  uint8_t* buffer = NULL;
+  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
+  if (!cpu_agents_.empty()) {
+    status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0,
+                                          reinterpret_cast<void**>(&buffer));
+    // Both the CPU and GPU can access the memory
+    if (status == HSA_STATUS_SUCCESS) {
+      hsa_agent_t ag_list[1] = {agent_info->dev_id};
+      status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer);
+    }
+  }
+  uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL;
+  return ptr;
+}
+
+// Allocate memory for command buffer.
+// The buffer is mmap-ed if the agent is an APU and CMD_MEMORY_MMAP is set,
+// otherwise it is allocated with AllocateSysMemory().
+// @param agent_info Agent from whose memory region to allocate
+// @param size Size of memory in terms of bytes
+// @return uint8_t* Pointer to buffer, null if allocation fails.
+uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) {
+  size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK;
+  if (agent_info->is_apu && CMD_MEMORY_MMAP) {
+    // Anonymous mappings take no file, so the descriptor is -1
+    void* mem =
+        mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+    // mmap() reports failure with MAP_FAILED, which is not a null pointer
+    return (mem != MAP_FAILED) ? reinterpret_cast<uint8_t*>(mem) : NULL;
+  }
+  return AllocateSysMemory(agent_info, size);
+}
+
+// Copy data from GPU to host memory
+// The copy is issued asynchronously and waited for before returning.
+// @param agent Agent passed to the copy as owner of the source buffer
+// @param dst Destination buffer, owned by the first CPU agent
+// @param src Source buffer
+// @param size Number of bytes to copy
+// @return bool true if the copy completed and the signal was released,
+// false otherwise
+bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) {
+  if (cpu_agents_.empty()) return false;
+
+  hsa_signal_t s = {};
+  hsa_status_t status = hsa_signal_create(1, 0, NULL, &s);
+  if (status != HSA_STATUS_SUCCESS) return false;
+
+  status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s);
+  if (status == HSA_STATUS_SUCCESS) {
+    if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
+                                  HSA_WAIT_STATE_BLOCKED) != 0) {
+      status = HSA_STATUS_ERROR;
+    }
+  }
+
+  // The signal is always released, but its status must not hide a failed
+  // copy or wait
+  const hsa_status_t destroy_status = hsa_signal_destroy(s);
+  if (status == HSA_STATUS_SUCCESS) status = destroy_status;
+
+  return (status == HSA_STATUS_SUCCESS);
+}
+bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) {
+  return Memcpy(agent_info->dev_id, dst, src, size);
+}
+
+// Memory free method
+// @param ptr Buffer to be released with hsa_memory_free()
+// @return bool true if successful
+bool HsaRsrcFactory::FreeMemory(void* ptr) {
+  const hsa_status_t status = hsa_memory_free(ptr);
+  CHECK_STATUS("hsa_memory_free", status);
+  return (status == HSA_STATUS_SUCCESS);
+}
+
+// Loads a code object file into a frozen executable and looks up a kernel
+// @param agent_info Gpu device for which the code object is loaded
+// @param brig_path File path of the code object file
+// @param kernel_name Name of the kernel symbol to look up
+// @param executable Output parameter updated with the created executable
+// @param code_desc Output parameter updated with the kernel symbol that could
+// be used to submit for execution
+// @return bool true if successful, false if the file cannot be opened or read
+// NOTE(review): the file handle and the code object reader are not released
+// by this method.
+bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path,
+                                     const char* kernel_name, hsa_executable_t* executable,
+                                     hsa_executable_symbol_t* code_desc) {
+  hsa_status_t status = HSA_STATUS_ERROR;
+
+  // Build the code object filename
+  std::string filename(brig_path);
+  std::clog << "Code object filename: " << filename << std::endl;
+
+  // Open the file containing code object
+  hsa_file_t file_handle = open(filename.c_str(), O_RDONLY);
+  if (file_handle == -1) {
+    std::cerr << "Error: failed to load '" << filename << "'" << std::endl;
+    assert(false);
+    return false;
+  }
+
+  // Create code object reader
+  hsa_code_object_reader_t code_obj_rdr = {0};
+  status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr);
+  if (status != HSA_STATUS_SUCCESS) {
+    std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl;
+    return false;
+  }
+
+  // Create executable.
+  status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
+                                     NULL, executable);
+  CHECK_STATUS("Error in creating executable object", status);
+
+  // Load code object.
+  status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr,
+                                                 NULL, NULL);
+  CHECK_STATUS("Error in loading executable object", status);
+
+  // Freeze executable.
+  status = hsa_executable_freeze(*executable, "");
+  CHECK_STATUS("Error in freezing executable object", status);
+
+  // Get symbol handle.
+  hsa_executable_symbol_t kernelSymbol;
+  status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0,
+                                     &kernelSymbol);
+  CHECK_STATUS("Error in looking up kernel symbol", status);
+
+  // Update output parameter
+  *code_desc = kernelSymbol;
+  return true;
+}
+
+// Print the various fields of Hsa Gpu Agents
+// @param header Caption printed before the agent list
+// @return bool Always true
+bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) {
+  std::clog << header << " :" << std::endl;
+
+  const AgentInfo* agent_info;
+  int size = uint32_t(gpu_list_.size());
+  for (int idx = 0; idx < size; idx++) {
+    agent_info = gpu_list_[idx];
+
+    std::clog << "> agent[" << idx << "] :" << std::endl;
+    std::clog << ">> Name : " << agent_info->name << std::endl;
+    std::clog << ">> APU : " << agent_info->is_apu << std::endl;
+    std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl;
+    std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl;
+    std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl;
+    std::clog << ">> CU number : " << agent_info->cu_num << std::endl;
+    std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl;
+    std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl;
+    std::clog << ">> SE number : " << agent_info->se_num << std::endl;
+    std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl;
+  }
+  return true;
+}
+
+// Submit one 64-byte AQL packet to the given queue
+// @param queue Queue to write the packet to
+// @param packet Pointer to the 64 bytes of the packet
+// @return uint64_t Write index of the slot taken by the packet
+// NOTE(review): the write index is advanced with a separate load and store,
+// which looks unsafe for concurrent submitters to the same queue - confirm
+// that callers serialize submissions.
+uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) {
+  const uint32_t slot_size_b = 0x40;
+
+  // advance command queue
+  const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue);
+  hsa_queue_store_write_index_relaxed(queue, write_idx + 1);
+  // wait until the slot has been consumed by the packet processor
+  while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) {
+    sched_yield();
+  }
+
+  uint32_t slot_idx = (uint32_t)(write_idx % queue->size);
+  uint32_t* queue_slot = reinterpret_cast<uint32_t*>((uintptr_t)(queue->base_address) +
+                                                     (slot_idx * slot_size_b));
+  const uint32_t* slot_data = reinterpret_cast<const uint32_t*>(packet);
+
+  // Copy buffered commands into the queue slot.
+  // Overwrite the AQL invalid header (first dword) last.
+  // This prevents the slot from being read until it's fully written.
+  memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t));
+  std::atomic<uint32_t>* header_atomic_ptr =
+      reinterpret_cast<std::atomic<uint32_t>*>(&queue_slot[0]);
+  header_atomic_ptr->store(slot_data[0], std::memory_order_release);
+
+  // ring the doorbell
+  hsa_signal_store_relaxed(queue->doorbell_signal, write_idx);
+
+  return write_idx;
+}
+
+// Submit a buffer of 64-byte AQL packets to the given queue, one by one
+// @param queue Queue to write the packets to
+// @param packet Pointer to the first packet
+// @param size_bytes Size of the buffer, must be a multiple of 64 bytes;
+// the process is aborted otherwise
+// @return uint64_t Write index of the last submitted packet, 0 if the buffer
+// is empty
+uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) {
+  const uint32_t slot_size_b = 0x40;
+  if ((size_bytes & (slot_size_b - 1)) != 0) {
+    fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes);
+    abort();
+  }
+
+  const char* begin = reinterpret_cast<const char*>(packet);
+  const char* end = begin + size_bytes;
+  uint64_t write_idx = 0;
+  for (const char* ptr = begin; ptr < end; ptr += slot_size_b) {
+    write_idx = Submit(queue, ptr);
+  }
+
+  return write_idx;
+}
+
+HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL;
+HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_;
diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h
new file mode 100644
index 00000000..e7dcc559
--- /dev/null
+++ b/test/util/hsa_rsrc_factory.h
@@ -0,0 +1,284 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef TEST_UTIL_HSA_RSRC_FACTORY_H_
+#define TEST_UTIL_HSA_RSRC_FACTORY_H_
+
+// NOTE(review): the header names of the fourteen #include lines below were
+// lost when this patch was flattened; both groups are rebuilt from the
+// identifiers used in this file - confirm against the upstream commit.
+#include <hsa.h>
+#include <hsa_api_trace.h>
+#include <hsa_ext_amd.h>
+#include <hsa_ext_finalize.h>
+#include <hsa_ven_amd_aqlprofile.h>
+#include <hsa_ven_amd_loader.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#define HSA_ARGUMENT_ALIGN_BYTES 16
+#define HSA_QUEUE_ALIGN_BYTES 64
+#define HSA_PACKET_ALIGN_BYTES 64
+
+// Print the HSA status string and terminate the process unless 'status' is
+// HSA_STATUS_SUCCESS. The macro expands to a bare 'if' statement.
+#define CHECK_STATUS(msg, status)                                                                  \
+  if (status != HSA_STATUS_SUCCESS) {                                                              \
+    const char* emsg = 0;                                                                          \
+    hsa_status_string(status, &emsg);                                                              \
+    printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>");                                      \
+    exit(1);                                                                                       \
+  }
+
+// Same as CHECK_STATUS but for iterations that are expected to be stopped by
+// their callback: every status other than HSA_STATUS_INFO_BREAK is fatal.
+#define CHECK_ITER_STATUS(msg, status)                                                             \
+  if (status != HSA_STATUS_INFO_BREAK) {                                                           \
+    const char* emsg = 0;                                                                          \
+    hsa_status_string(status, &emsg);                                                              \
+    printf("%s: %s\n", msg, emsg ? emsg : "<unknown error>");                                      \
+    exit(1);                                                                                       \
+  }
+
+// Allocation sizes are rounded up to a multiple of MEM_PAGE_BYTES
+static const size_t MEM_PAGE_BYTES = 0x1000;
+static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1;
+typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t;
+
+// Encapsulates information about a Hsa Agent such as its
+// handle, name, max queue size, max wavefront size, etc.
+struct AgentInfo {
+  // Handle of Agent
+  hsa_agent_t dev_id;
+
+  // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2
+  uint32_t dev_type;
+
+  // APU flag
+  bool is_apu;
+
+  // Agent system index
+  uint32_t dev_index;
+
+  // GFXIP name
+  char gfxip[64];
+
+  // Name of Agent whose length is less than 64
+  char name[64];
+
+  // Max size of Wavefront size
+  uint32_t max_wave_size;
+
+  // Max size of Queue buffer
+  uint32_t max_queue_size;
+
+  // Hsail profile supported by agent
+  hsa_profile_t profile;
+
+  // CPU/GPU/kern-arg memory pools
+  hsa_amd_memory_pool_t cpu_pool;
+  hsa_amd_memory_pool_t gpu_pool;
+  hsa_amd_memory_pool_t kern_arg_pool;
+
+  // The number of compute unit available in the agent.
+  uint32_t cu_num;
+
+  // Maximum number of waves possible in a Compute Unit.
+  uint32_t waves_per_cu;
+
+  // Number of SIMD's per compute unit CU
+  uint32_t simds_per_cu;
+
+  // Number of Shader Engines (SE) in Gpu
+  uint32_t se_num;
+
+  // Number of Shader Arrays Per Shader Engines in Gpu
+  uint32_t shader_arrays_per_se;
+};
+
+// Singleton that discovers the Cpu and Gpu agents of the system and provides
+// helpers to create queues and signals, allocate and copy memory, load code
+// objects and submit AQL packets.
+class HsaRsrcFactory {
+ public:
+  typedef std::recursive_mutex mutex_t;
+
+  // Create the singleton instance if it does not exist yet
+  // @param initialize_hsa Whether the instance initializes the Hsa Runtime
+  // @return HsaRsrcFactory* The singleton instance
+  static HsaRsrcFactory* Create(bool initialize_hsa = true) {
+    std::lock_guard<mutex_t> lck(mutex_);
+    if (instance_ == NULL) {
+      instance_ = new HsaRsrcFactory(initialize_hsa);
+    }
+    return instance_;
+  }
+
+  // Return the singleton instance; it is created without initializing the
+  // Hsa Runtime if it does not exist yet
+  static HsaRsrcFactory& Instance() {
+    if (instance_ == NULL) instance_ = Create(false);
+    hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
+    CHECK_STATUS("HsaRsrcFactory::Instance() failed", status);
+    return *instance_;
+  }
+
+  // Destroy the singleton instance, if any
+  static void Destroy() {
+    std::lock_guard<mutex_t> lck(mutex_);
+    if (instance_) delete instance_;
+    instance_ = NULL;
+  }
+
+  // Return system agent info
+  const AgentInfo* GetAgentInfo(const hsa_agent_t agent);
+
+  // Get the count of Hsa Gpu Agents available on the platform
+  // @return uint32_t Number of Gpu agents on platform
+  uint32_t GetCountOfGpuAgents();
+
+  // Get the count of Hsa Cpu Agents available on the platform
+  // @return uint32_t Number of Cpu agents on platform
+  uint32_t GetCountOfCpuAgents();
+
+  // Get the AgentInfo handle of a Gpu device
+  // @param idx Gpu Agent at specified index
+  // @param agent_info Output parameter updated with AgentInfo
+  // @return bool true if successful, false otherwise
+  bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
+
+  // Get the AgentInfo handle of a Cpu device
+  // @param idx Cpu Agent at specified index
+  // @param agent_info Output parameter updated with AgentInfo
+  // @return bool true if successful, false otherwise
+  bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info);
+
+  // Create a Queue object and return its handle. The queue object is expected
+  // to support user requested number of Aql dispatch packets.
+  // @param agent_info Gpu Agent on which to create a queue object
+  // @param num_pkts Number of packets to be held by queue
+  // @param queue Output parameter updated with handle of queue object
+  // @return bool true if successful, false otherwise
+  bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue);
+
+  // Create a Signal object and return its handle.
+  // @param value Initial value of signal object
+  // @param signal Output parameter updated with handle of signal object
+  // @return bool true if successful, false otherwise
+  bool CreateSignal(uint32_t value, hsa_signal_t* signal);
+
+  // Allocate local GPU memory
+  // @param agent_info Agent from whose memory region to allocate
+  // @param size Size of memory in terms of bytes
+  // @return uint8_t* Pointer to buffer, null if allocation fails.
+  uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size);
+
+  // Allocate memory to pass kernel parameters
+  // Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
+  // @param agent_info Agent that is granted access to the memory
+  // @param size Size of memory in terms of bytes
+  // @return uint8_t* Pointer to buffer, null if allocation fails.
+  uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size);
+
+  // Allocate system memory accessible from both CPU and GPU
+  // Memory is allocated accessible for all CPU agents and for GPU given by AgentInfo parameter.
+  // @param agent_info Agent that is granted access to the memory
+  // @param size Size of memory in terms of bytes
+  // @return uint8_t* Pointer to buffer, null if allocation fails.
+  uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size);
+
+  // Allocate memory for command buffer.
+  // @param agent_info Agent from whose memory region to allocate
+  // @param size Size of memory in terms of bytes
+  // @return uint8_t* Pointer to buffer, null if allocation fails.
+  uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size);
+
+  // Copy data from GPU to host memory
+  bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size);
+  bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size);
+
+  // Memory free method
+  static bool FreeMemory(void* ptr);
+
+  // Loads an Assembled Brig file and Finalizes it into Device Isa
+  // @param agent_info Gpu device for which to finalize
+  // @param brig_path File path of the Assembled Brig file
+  // @param kernel_name Name of the kernel to finalize
+  // @param hsa_exec Output parameter updated with the created executable
+  // @param code_desc Handle of finalized Code Descriptor that could
+  // be used to submit for execution
+  // @return true if successful, false otherwise
+  bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name,
+                       hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc);
+
+  // Print the various fields of Hsa Gpu Agents
+  bool PrintGpuAgents(const std::string& header);
+
+  // Submit AQL packet to given queue
+  static uint64_t Submit(hsa_queue_t* queue, const void* packet);
+  static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes);
+
+  // Return AqlProfile API table
+  typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t;
+  const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; }
+
+  // Return Loader API table
+  const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; }
+
+ private:
+  // System agents iterating callback
+  static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data);
+
+  // Callback function to find and bind kernarg region of an agent
+  static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data);
+
+  // Load AQL profile HSA extension library directly
+  static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api);
+
+  // Constructor of the class. Will initialize the Hsa Runtime if requested and
+  // query the system topology to get the list of Cpu and Gpu devices
+  explicit HsaRsrcFactory(bool initialize_hsa);
+
+  // Destructor of the class
+  ~HsaRsrcFactory();
+
+  // Add an instance of AgentInfo representing a Hsa Cpu or Gpu agent
+  const AgentInfo* AddAgentInfo(const hsa_agent_t agent);
+
+  // To mmap command buffer memory
+  static const bool CMD_MEMORY_MMAP = false;
+
+  // Whether this object initialized the Hsa Runtime and has to shut it down
+  const bool initialize_hsa_;
+
+  static HsaRsrcFactory* instance_;
+  static mutex_t mutex_;
+
+  // Used to maintain a list of Hsa Gpu Agent Info
+  std::vector<const AgentInfo*> gpu_list_;
+  std::vector<hsa_agent_t> gpu_agents_;
+
+  // Used to maintain a list of Hsa Cpu Agent Info
+  std::vector<const AgentInfo*> cpu_list_;
+  std::vector<hsa_agent_t> cpu_agents_;
+
+  // System agents map
+  std::map<hsa_agent_handle_t, const AgentInfo*> agent_map_;
+
+  // AqlProfile API table
+  aqlprofile_pfn_t aqlprofile_api_;
+
+  // Loader API table
+  hsa_ven_amd_loader_1_00_pfn_t loader_api_;
+};
+
+
+#endif  // TEST_UTIL_HSA_RSRC_FACTORY_H_
diff --git a/test/util/perf_timer.cpp b/test/util/perf_timer.cpp
new file mode 100644
index 00000000..85c490b6
--- /dev/null
+++ b/test/util/perf_timer.cpp
@@ -0,0 +1,179 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#include "util/perf_timer.h"
+
+// The constructor measures the TSC frequency, which busy-waits for one
+// gigacycle of the time stamp counter
+PerfTimer::PerfTimer() { freq_in_100mhz_ = MeasureTSCFreqHz(); }
+
+// Release all timer instances
+PerfTimer::~PerfTimer() {
+  while (!timers_.empty()) {
+    Timer* temp = timers_.back();
+    timers_.pop_back();
+    delete temp;
+  }
+}
+
+// Create a new timer instance
+// @return int Index of the newly created timer instance
+int PerfTimer::CreateTimer() {
+  Timer* newTimer = new Timer;
+  newTimer->start = 0;
+  newTimer->clocks = 0;
+
+#ifdef _WIN32
+  QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->freq);
+#else
+  // Readings are accumulated in ms, see StartTimer()/StopTimer()
+  newTimer->freq = (long long)1.0E3;
+#endif
+
+  /* Push back the address of new Timer instance created */
+  timers_.push_back(newTimer);
+  return (int)(timers_.size() - 1);
+}
+
+// Record the start time of a timer
+// @param index Timer index returned by CreateTimer()
+// @return int SUCCESS, or FAILURE if the index is not a valid timer
+int PerfTimer::StartTimer(int index) {
+  // Negative indices are rejected as well, they would address memory in
+  // front of the timer list
+  if (index < 0 || index >= (int)timers_.size()) {
+    Error("Cannot start timer. Invalid handle.");
+    return FAILURE;
+  }
+
+#ifdef _WIN32
+// General Windows timing method
+#ifndef _AMD
+  long long tmpStart;
+  QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart));
+  timers_[index]->start = (double)tmpStart;
+#else
+// AMD Windows timing method
+#endif
+#else
+// General Linux timing method
+#ifndef _AMD
+  struct timeval s;
+  gettimeofday(&s, 0);
+  timers_[index]->start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3));
+#else
+  // AMD timing method
+  unsigned int unused;
+  timers_[index]->start = __rdtscp(&unused);
+#endif
+#endif
+
+  return SUCCESS;
+}
+
+
+// Stop a timer and add the time elapsed since StartTimer() to its total
+// @param index Timer index returned by CreateTimer()
+// @return int SUCCESS, or FAILURE if the index is not a valid timer
+int PerfTimer::StopTimer(int index) {
+  double n = 0;
+  if (index < 0 || index >= (int)timers_.size()) {
+    Error("Cannot stop timer. Invalid handle.");
+    return FAILURE;
+  }
+#ifdef _WIN32
+#ifndef _AMD
+  long long n1;
+  QueryPerformanceCounter((LARGE_INTEGER*)&(n1));
+  n = (double)n1;
+#else
+// AMD Window Timing
+#endif
+
+#else
+// General Linux timing method
+#ifndef _AMD
+  struct timeval s;
+  gettimeofday(&s, 0);
+  n = s.tv_sec * 1.0E3 + (double)(s.tv_usec / 1.0E3);
+#else
+  // AMD Linux timing
+  unsigned int unused;
+  n = __rdtscp(&unused);
+#endif
+#endif
+
+  n -= timers_[index]->start;
+  timers_[index]->start = 0;
+
+#ifndef _AMD
+  timers_[index]->clocks += n;
+#else
+  // timers_[index]->clocks += 10 * n / freq_in_100mhz_; // unit is ns
+  timers_[index]->clocks += 1.0E-6 * 10 * n / freq_in_100mhz_;  // convert to ms
+#endif
+
+  return SUCCESS;
+}
+
+// Print an error message to the standard output stream
+void PerfTimer::Error(std::string str) { std::cout << str << std::endl; }
+
+
+// Read the time accumulated by a timer
+// @param index Timer index returned by CreateTimer()
+// @return double Accumulated clocks divided by the timer frequency, or
+// FAILURE if the index is not a valid timer
+double PerfTimer::ReadTimer(int index) {
+  if (index < 0 || index >= (int)timers_.size()) {
+    Error("Cannot read timer. Invalid handle.");
+    return FAILURE;
+  }
+
+  double reading = double(timers_[index]->clocks);
+
+  reading = double(reading / timers_[index]->freq);
+
+  return reading;
+}
+
+
+// Return a coarse timestamp in microseconds
+uint64_t PerfTimer::CoarseTimestampUs() {
+#ifdef _WIN32
+  uint64_t freqHz, ticks;
+  QueryPerformanceFrequency((LARGE_INTEGER*)&freqHz);
+  QueryPerformanceCounter((LARGE_INTEGER*)&ticks);
+
+  // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t.
+  while (ticks > (1ULL << 44)) {
+    ticks /= 16;
+    freqHz /= 16;
+  }
+
+  return (ticks * 1000000) / freqHz;
+#else
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
+  return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000;
+#endif
+}
+
+// Measure the TSC frequency
+// @return uint64_t TSC frequency in units of 100MHz, rounded to nearest
+uint64_t PerfTimer::MeasureTSCFreqHz() {
+  // Make a coarse interval measurement of TSC ticks for 1 gigacycles.
+  unsigned int unused;
+  uint64_t tscTicksEnd;
+
+  uint64_t coarseBeginUs = CoarseTimestampUs();
+  uint64_t tscTicksBegin = __rdtscp(&unused);
+  do {
+    tscTicksEnd = __rdtscp(&unused);
+  } while (tscTicksEnd - tscTicksBegin < 1000000000);
+
+  uint64_t coarseEndUs = CoarseTimestampUs();
+
+  // Compute the TSC frequency and round to nearest 100MHz.
+  // ticks per ns is the frequency in GHz, so ten times that is in 100MHz units
+  uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000;
+  uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin;
+  return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs;
+}
diff --git a/test/util/perf_timer.h b/test/util/perf_timer.h
new file mode 100644
index 00000000..bfd55324
--- /dev/null
+++ b/test/util/perf_timer.h
@@ -0,0 +1,83 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_UTIL_PERF_TIMER_H_ +#define TEST_UTIL_PERF_TIMER_H_ + +// Will use AMD timer or general Linux timer based on compilation flag +// Need to consider platform is Windows or Linux + +#include +#include +#include + +#if defined(_MSC_VER) +#include +#include +#include +#else +#if defined(__GNUC__) +#include +#include +#endif // __GNUC__ +#endif // _MSC_VER + +#include +#include +#include + +class PerfTimer { + public: + enum { SUCCESS = 0, FAILURE = 1 }; + + PerfTimer(); + ~PerfTimer(); + + // General Linux timing method + int CreateTimer(); + int StartTimer(int index); + int StopTimer(int index); + + // retrieve time + double ReadTimer(int index); + // write into a file + double WriteTimer(int index); + + private: + struct Timer { + std::string name; /* name of time object */ + long long freq; /* frequency */ + double clocks; /* number of ticks at end */ + double start; /* start point ticks */ + }; + + std::vector timers_; /* vector to Timer objects */ + double freq_in_100mhz_; + + // AMD timing method + uint64_t CoarseTimestampUs(); + uint64_t MeasureTSCFreqHz(); + + void Error(std::string str); +}; + +#endif // TEST_UTIL_PERF_TIMER_H_ diff --git a/test/util/test_assert.h b/test/util/test_assert.h new file mode 100644 index 00000000..ee183810 --- /dev/null +++ b/test/util/test_assert.h @@ -0,0 +1,46 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_ASSERT_H_ +#define TEST_CTRL_TEST_ASSERT_H_ + +#define TEST_ASSERT(cond) \ + { \ + if (!(cond)) { \ + std::cerr << "Assert failed(" << #cond << ") at " << __FILE__ << ", line " << __LINE__ \ + << std::endl; \ + exit(-1); \ + } \ + } + +#define TEST_STATUS(cond) \ + { \ + if (!(cond)) { \ + std::cerr << "Test error at " << __FILE__ << ", line " << __LINE__ << std::endl; \ + const char* message; \ + rocprofiler_error_string(&message); \ + std::cerr << "ERROR: " << message << std::endl; \ + exit(-1); \ + } \ + } + +#endif // TEST_CTRL_TEST_ASSERT_H_ diff --git a/test/util/xml.h b/test/util/xml.h new file mode 100644 index 00000000..eb2f5074 --- /dev/null +++ b/test/util/xml.h @@ -0,0 +1,457 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. 
All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_UTIL_XML_H_ +#define TEST_UTIL_XML_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace xml { + +class Xml { + public: + typedef std::vector token_t; + + struct level_t; + typedef std::vector nodes_t; + typedef std::map opts_t; + struct level_t { + std::string tag; + nodes_t nodes; + opts_t opts; + }; + typedef std::vector nodes_vec_t; + typedef std::map map_t; + + enum { DECL_STATE, BODY_STATE }; + + static Xml* Create(const std::string& file_name, const Xml* obj = NULL) { + Xml* xml = new Xml(file_name, obj); + if (xml != NULL) { + if (xml->Init() == false) { + delete xml; + xml = NULL; + } else { + const std::size_t pos = file_name.rfind('/'); + const std::string path = (pos != std::string::npos) ? 
file_name.substr(0, pos + 1) : ""; + + xml->PreProcess(); + nodes_t incl_nodes; + for (auto* node : xml->GetNodes("top.include")) { + if (node->opts.find("touch") == node->opts.end()) { + node->opts["touch"] = ""; + incl_nodes.push_back(node); + } + } + for (auto* incl : incl_nodes) { + const std::string& incl_name = path + incl->opts["file"]; + Xml* ixml = Create(incl_name, xml); + if (ixml == NULL) { + delete xml; + xml = NULL; + break; + } else { + delete ixml; + } + } + if (xml) { + xml->Process(); + } + } + } + + return xml; + } + + static void Destroy(Xml* xml) { delete xml; } + + std::string GetName() { return file_name_; } + + void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) { + const std::size_t pos = full_tag.rfind('.'); + const std::size_t pos1 = (pos == std::string::npos) ? 0 : pos + 1; + const std::string level_tag = full_tag.substr(pos1); + level_t* level = new level_t; + (*map_)[full_tag].push_back(level); + level->tag = level_tag; + level->opts["name"] = name; + level->opts["expr"] = expr; + } + + void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) { + std::ostringstream oss; + oss << val; + AddExpr(full_tag, name, oss.str()); + } + + nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; } + + template F ForEach(const F& f_i) { + F f = f_i; + if (map_) { + for (auto& entry : *map_) { + for (auto node : entry.second) { + if (f.fun(entry.first, node) == false) break; + } + } + } + return f; + } + + template F ForEach(const F& f_i) const { + F f = f_i; + if (map_) { + for (auto& entry : *map_) { + for (auto node : entry.second) { + if (f.fun(entry.first, node) == false) break; + } + } + } + return f; + } + + struct print_func { + bool fun(const std::string& global_tag, level_t* node) { + for (auto& opt : node->opts) { + std::cout << global_tag << "." 
<< opt.first << " = " << opt.second << std::endl; + } + return true; + } + }; + + void Print() const { + std::cout << "XML file '" << file_name_ << "':" << std::endl; + ForEach(print_func()); + } + + private: + Xml(const std::string& file_name, const Xml* obj) + : file_name_(file_name), + file_line_(0), + data_size_(0), + index_(0), + state_(BODY_STATE), + comment_(false), + included_(false), + level_(NULL), + map_(NULL) { + if (obj != NULL) { + map_ = obj->map_; + level_ = obj->level_; + included_ = true; + } + } + + struct delete_func { + bool fun(const std::string&, level_t* node) { + delete node; + return true; + } + }; + + ~Xml() { + if (included_ == false) { + ForEach(delete_func()); + delete map_; + } + } + + bool Init() { + fd_ = open(file_name_.c_str(), O_RDONLY); + if (fd_ == -1) { + // perror((std::string("open XML file ") + file_name_).c_str()); + return false; + } + + if (map_ == NULL) { + map_ = new map_t; + if (map_ == NULL) return false; + AddLevel("top"); + } + + return true; + } + + void PreProcess() { + uint32_t ind = 0; + char buf[kBufSize]; + bool error = false; + + while (1) { + const uint32_t pos = lseek(fd_, 0, SEEK_CUR); + uint32_t size = read(fd_, buf, kBufSize); + if (size <= 0) break; + buf[size - 1] = '\0'; + + if (strncmp(buf, "#include \"", 10) == 0) { + for (ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) {} + if (ind == size) { + fprintf(stderr, "XML PreProcess failed, line size limit %zu\n", kBufSize); + error = true; + break; + } + buf[ind] = '\0'; + size = ind; + lseek(fd_, pos + ind + 1, SEEK_SET); + + for (ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) {} + if (ind == size) { + error = true; + break; + } + buf[ind] = '\0'; + + AddLevel("include"); + AddOption("file", &buf[10]); + UpLevel(); + } + } + + if (error) { + fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf); + exit(1); + } + + lseek(fd_, 0, SEEK_SET); + } + + void Process() { + token_t remainder; + + while (1) { + token_t token = (remainder.size()) ? 
remainder : NextToken(); + remainder.clear(); + + // token_t token1 = token; + // token1.push_back('\0'); + // std::cout << "> " << &token1[0] << std::endl; + + // End of file + if (token.size() == 0) break; + + switch (state_) { + case BODY_STATE: + if (token[0] == '<') { + bool node_begin = true; + unsigned ind = 1; + if (token[1] == '/') { + node_begin = false; + ++ind; + } + + unsigned i = ind; + while (i < token.size()) { + if (token[i] == '>') break; + ++i; + } + for (unsigned j = i + 1; j < token.size(); ++j) remainder.push_back(token[j]); + + if (i == token.size()) { + if (node_begin) + state_ = DECL_STATE; + else + BadFormat(token); + token.push_back('\0'); + } else { + token[i] = '\0'; + } + + const char* tag = &token[ind]; + if (node_begin) { + AddLevel(tag); + } else { + if (strncmp(CurrentLevel().c_str(), tag, strlen(tag)) != 0) { + token.back() = '>'; + BadFormat(token); + } + UpLevel(); + } + } else { + BadFormat(token); + } + break; + case DECL_STATE: + if (token[0] == '>') { + state_ = BODY_STATE; + for (unsigned j = 1; j < token.size(); ++j) remainder.push_back(token[j]); + continue; + } else { + token.push_back('\0'); + unsigned j = 0; + for (j = 0; j < token.size(); ++j) + if (token[j] == '=') break; + if (j == token.size()) BadFormat(token); + token[j] = '\0'; + const char* key = &token[0]; + const char* value = &token[j + 1]; + AddOption(key, value); + } + break; + default: + std::cout << "XML parser error: wrong state: " << state_ << std::endl; + exit(1); + } + } + } + + bool SpaceCheck() const { + bool cond = ((buffer_[index_] == ' ') || (buffer_[index_] == '\t')); + return cond; + } + + bool LineEndCheck() { + bool found = false; + if (buffer_[index_] == '\n') { + buffer_[index_] = ' '; + ++file_line_; + found = true; + comment_ = false; + } else if (comment_ || (buffer_[index_] == '#')) { + found = true; + comment_ = true; + } + return found; + } + + token_t NextToken() { + token_t token; + bool in_string = false; + bool special_symb = 
false; + + while (1) { + if (data_size_ == 0) { + data_size_ = read(fd_, buffer_, kBufSize); + if (data_size_ <= 0) break; + } + + if (token.empty()) { + while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) { + ++index_; + } + } + while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) { + const char symb = buffer_[index_]; + bool skip_symb = false; + + switch (symb) { + case '\\': + if (special_symb) { + special_symb = false; + } else { + special_symb = true; + skip_symb = true; + } + break; + case '"': + if (special_symb) { + special_symb = false; + } else { + in_string = !in_string; + if (!in_string) { + buffer_[index_] = ' '; + --index_; + } + skip_symb = true; + } + break; + } + + if (!skip_symb) token.push_back(symb); + ++index_; + } + + if (index_ == data_size_) { + index_ = 0; + data_size_ = 0; + } else { + if (special_symb || in_string) BadFormat(token); + break; + } + } + + return token; + } + + void BadFormat(token_t token) { + token.push_back('\0'); + std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" + << &token[0] << "'" << std::endl; + exit(1); + } + + void AddLevel(const std::string& tag) { + level_t* level = new level_t; + level->tag = tag; + if (level_) { + level_->nodes.push_back(level); + stack_.push_back(level_); + } + level_ = level; + + std::string global_tag; + for (level_t* level : stack_) { + global_tag += level->tag + "."; + } + global_tag += tag; + (*map_)[global_tag].push_back(level_); + } + + void UpLevel() { + level_ = stack_.back(); + stack_.pop_back(); + } + + std::string CurrentLevel() const { return level_->tag; } + + void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; } + + const std::string file_name_; + unsigned file_line_; + int fd_; + + static const size_t kBufSize = 256; + char buffer_[kBufSize]; + + unsigned data_size_; + unsigned index_; + unsigned state_; + bool comment_; + std::vector stack_; + bool 
included_; + level_t* level_; + map_t* map_; +}; + +} // namespace xml + +#endif // TEST_UTIL_XML_H_ From 12f8613b5c7436bc5d9c2a6f4633f89701f549ab Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 17 Jul 2018 15:06:05 -0500 Subject: [PATCH 002/153] update --- LICENSE | 2 -- 1 file changed, 2 deletions(-) diff --git a/LICENSE b/LICENSE index fe4ce68b..9e78331e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,3 @@ -/****************************************************************************** Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy @@ -18,4 +17,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*******************************************************************************/ From edde6c11ac0bee2ced5c18446b7823bbb089665a Mon Sep 17 00:00:00 2001 From: Gregory Stoner Date: Sat, 18 Aug 2018 11:40:44 -0500 Subject: [PATCH 003/153] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 5492d17d..3ac63141 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. +HW specific low-level performance analysis interface for profiling of GPU compute applications. 
The profiling includes HW performance counters with complex performance metrics and HW traces + The library source tree: - doc - Documentation - inc/rocprofiler.h - Library public API From 05e63d2529c4d2b8133d74427d7820c62df19079 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sun, 19 Aug 2018 21:42:46 -0500 Subject: [PATCH 004/153] update, version 3.0 --- CMakeLists.txt | 24 ++- bin/rpl_run.sh | 352 +++++++++++++++++++++++++++++++++ bin/tblextr.py | 121 ++++++++++++ bin/txt2xml.sh | 94 +++++++++ cmake_modules/env.cmake | 13 +- cmake_modules/utils.cmake | 6 +- inc/rocprofiler.h | 24 ++- src/CMakeLists.txt | 6 +- src/core/context.h | 110 +++++------ src/core/group_set.h | 244 +++++++++++++++++++++++ src/core/intercept_queue.cpp | 3 +- src/core/intercept_queue.h | 178 ++++++++++++----- src/core/profile.h | 4 + src/core/rocprofiler.cpp | 85 +++++++- src/core/tracker.h | 166 +++++++++------- src/core/types.h | 1 + src/util/exception.h | 16 +- src/util/hsa_rsrc_factory.cpp | 110 +++++++---- src/util/hsa_rsrc_factory.h | 126 +++++++++--- src/util/logger.h | 66 +++++-- test/CMakeLists.txt | 12 +- test/run.sh | 13 +- test/tool/input1.xml | 5 + test/tool/tool.cpp | 187 +++++++++--------- test/util/hsa_rsrc_factory.cpp | 106 ++++++---- test/util/hsa_rsrc_factory.h | 126 +++++++++--- 26 files changed, 1729 insertions(+), 469 deletions(-) create mode 100755 bin/rpl_run.sh create mode 100755 bin/tblextr.py create mode 100755 bin/txt2xml.sh create mode 100644 src/core/group_set.h create mode 100644 test/tool/input1.xml diff --git a/CMakeLists.txt b/CMakeLists.txt index 6249e098..92bc348f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
-# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -20,7 +20,7 @@ # THE SOFTWARE. ################################################################################ -cmake_minimum_required ( VERSION 3.5.0 ) +cmake_minimum_required ( VERSION 2.8.12 ) ## Verbose output. 
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) @@ -76,6 +76,22 @@ add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) ## Install information install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${ROCPROFILER_NAME}/lib ) install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${ROCPROFILER_NAME}/include ) +# rpl_run.sh tblextr.py txt2xml.sh +install ( FILES + ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh + ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh + ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py + DESTINATION ${ROCPROFILER_NAME}/bin + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) +# gfx_metrics.xml metrics.xml +install ( FILES + ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml + ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/gfx_metrics.xml + DESTINATION ${ROCPROFILER_NAME}/lib ) +# libtool.so +install ( FILES ${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION ${ROCPROFILER_NAME}/tool ) +install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION ${ROCPROFILER_NAME}/tool + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) ## Packaging directives set ( CPACK_PACKAGE_NAME "${ROCPROFILER_NAME}-dev" ) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh new file mode 100755 index 00000000..64185761 --- /dev/null +++ b/bin/rpl_run.sh @@ -0,0 +1,352 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + +#!/bin/sh +time_stamp=`date +%y%m%d_%H%M%S` +BIN_DIR=`dirname $0` +BIN_DIR=`cd $BIN_DIR; pwd` +RUN_DIR=`pwd` +TMP_DIR="/tmp" +DATA_PATH=$TMP_DIR +DATA_DIR="rpl_data_${time_stamp}_$$" + +PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` +BIN_DIR=$PKG_DIR/bin + +# PATH to custom HSA and OpenCl runtimes +HSA_PATH=$PKG_DIR/lib/hsa + +export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH +export PATH=.:$PATH + +# enable error logging +export HSA_TOOLS_REPORT_LOAD_FAILURE=1 +export HSA_VEN_AMD_AQLPROFILE_LOG=1 +export ROCPROFILER_LOG=1 +unset ROCPROFILER_SESS + +# ROC Profiler environment +# Loading of ROC Profiler by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so +# Loading of the test tool by ROC Profiler +export ROCP_TOOL_LIB=libtool.so +# Enabling HSA dispatches intercepting by ROC PRofiler +export ROCP_HSA_INTERCEPT=1 +# Disabling internal ROC Profiler proxy queue (simple version supported for testing purposes) +unset ROCP_PROXY_QUEUE +# ROC Profiler metrics definition +export ROCP_METRICS=$PKG_DIR/lib/metrics.xml +# Disable AQL-profile read API +export AQLPROFILE_READ_API=0 +# ROC Profiler package path +export ROCP_PACKAGE_DIR=$PKG_DIR + +# error handling +fatal() { + echo "$0: Error: $1" + echo "" + usage +} + +error() { + echo "$0: Error: $1" + echo "" + exit 1 +} + +# usage method +usage() { + bin_name=`basename $0` + echo "ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package." 
+ echo "Full path: $BIN_DIR/$bin_name" + echo "Metrics definition: $PKG_DIR/lib/metrics.xml" + echo "" + echo "Usage:" + echo " rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] " + echo "" + echo "Options:" + echo " -h - this help" + echo " --verbose - verbose mode, dumping all base counters used in the input metrics" + echo " --list-basic - to print the list of basic HW counters" + echo " --list-derived - to print the list of derived metrics with formulas" + echo "" + echo " -i <.txt|.xml file> - input file" + echo " Input file .txt format, automatically rerun application for every pmc/sqtt line:" + echo "" + echo " # Perf counters group 1" + echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize" + echo " # Perf counters group 2" + echo " pmc : WriteSize L2CacheHit" + echo " # Filter by dispatches range, GPU index and kernel names" + echo " # supported range formats: \"3:9\", \"3:\", \"3\"" + echo " range: 1 : 4" + echo " gpu: 0 1 2 3" + echo " kernel: simple Pass1 simpleConvolutionPass2" + echo "" + echo " Input file .xml format, for single profiling run:" + echo "" + echo " # Metrics list definition, also the form \":\" can be used" + echo " # All defined metrics can be found in the 'metrics.xml'" + echo " # There are basic metrics for raw HW counters and high-level metrics for derived counters" + echo " " + echo "" + echo " # Filter by dispatches range, GPU index and kernel names" + echo " " + echo "" + echo " -o - output CSV file [.csv]" + echo " -d - directory where profiler store profiling data including thread treaces [/tmp]" + echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." + echo " -t - to change the temporary directory [/tmp]" + echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." 
+ echo "" + echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" + echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" + echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" + echo " --heartbeat - to print progress heartbeats [0 - disabled]" + echo " --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000]" + echo " Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively." + echo " --sqtt-local - to allocate SQTT buffer in local GPU memory [on]" + echo "" + echo "Configuration file:" + echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" + echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." + echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'." + echo " An example of 'rpl_rc.xml':" + echo " " + echo "" + exit 1 +} + +# profiling run method +OUTPUT_LIST="" +run() { + export ROCP_INPUT="$1" + OUTPUT_DIR="$2" + shift + shift + APP_CMD=$* + + if [ "$OUTPUT_DIR" = "-" ] ; then + input_tag=`echo $ROCP_INPUT | sed "s/\.xml//"` + export ROCP_OUTPUT_DIR=${input_tag}_results_${time_stamp} + elif [ "$OUTPUT_DIR" = "--" ] ; then + unset ROCP_OUTPUT_DIR + else + export ROCP_OUTPUT_DIR=$OUTPUT_DIR + fi + echo "RPL: result dir '$ROCP_OUTPUT_DIR'" + + if [ ! 
-e "$ROCP_INPUT" ] ; then + error "Input file '$ROCP_INPUT' not found" + fi + + if [ -n "$ROCP_OUTPUT_DIR" ] ; then + if [ "$OUTPUT_DIR" = "-" ] ; then + if [ -e "$ROCP_OUTPUT_DIR" ] ; then + error "generated dir '$ROCP_OUTPUT_DIR' exists" + fi + fi + mkdir -p "$ROCP_OUTPUT_DIR" + fi + + if [ -n "$ROCP_OUTPUT_DIR" ] ; then + OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" + eval "$APP_CMD 2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" + else + eval "$APP_CMD" + fi +} + +# main +echo "RPL: on '$time_stamp' from '$PKG_DIR' in '$RUN_DIR'" +# Parsing arguments +if [ -z "$1" ] ; then + usage +fi + +INPUT_FILE="" +OUTPUT_DIR="-" +output="" +csv_output="" + +ARG_IN="" +while [ 1 ] ; do + ARG_IN=$1 + ARG_VAL=1 + if [ "$1" = "-h" ] ; then + usage + elif [ "$1" = "-i" ] ; then + INPUT_FILE="$2" + elif [ "$1" = "-o" ] ; then + output="$2" + elif [ "$1" = "-d" ] ; then + OUTPUT_DIR="$2" + DATA_PATH=$OUTPUT_DIR + elif [ "$1" = "-t" ] ; then + TMP_DIR="$2" + if [ "$OUTPUT_DIR" = "-" ] ; then + DATA_PATH=$TMP_DIR + fi + elif [ "$1" = "--list-basic" ] ; then + export ROCP_INFO=b + eval "$PKG_DIR/tool/ctrl" + exit 1 + elif [ "$1" = "--list-derived" ] ; then + export ROCP_INFO=d + eval "$PKG_DIR/tool/ctrl" + exit 1 + elif [ "$1" = "--basenames" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_TRUNCATE_NAMES=1 + else + export ROCP_TRUNCATE_NAMES=0 + fi + elif [ "$1" = "--timestamp" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_TIMESTAMP_ON=1 + else + export ROCP_TIMESTAMP_ON=0 + fi + elif [ "$1" = "--ctx-limit" ] ; then + export ROCP_OUTSTANDING_MAX="$2" + elif [ "$1" = "--heartbeat" ] ; then + export ROCP_OUTSTANDING_MON="$2" + elif [ "$1" = "--sqtt-size" ] ; then + size_m=`echo "$2" | sed -n "s/^\(.*\)M$/\1/p"` + size_k=`echo "$2" | sed -n "s/^\(.*\)K$/\1/p"` + if [ -n "$size_m" ] ; then size_b=$((size_m*1024*1024)) + elif [ -n "$size_k" ] ; then size_b=$((size_k*1024)) + else size_b=$2 + fi + export ROCP_SQTT_SIZE=$size_b + elif [ "$1" = "--sqtt-local" ] ; then + if [ "$2" = 
"on" ] ; then + export ROCP_SQTT_LOCAL=1 + else + export ROCP_SQTT_LOCAL=0 + fi + elif [ "$1" = "--verbose" ] ; then + ARG_VAL=0 + export ROCP_VERBOSE_MODE=1 + else + break + fi + shift + if [ "$ARG_VAL" = 1 ] ; then shift; fi +done + +ARG_CK=`echo $ARG_IN | sed "s/^-.*$/-/"` +if [ "$ARG_CK" = "-" ] ; then + fatal "Wrong option '$ARG_IN'" +fi + +if [ -z "$INPUT_FILE" ] ; then + fatal "Need input file" +fi + +input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"` +input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"` +if [ -z "${input_base}" -o -z "${input_type}" ] ; then + fatal "Bad input file '$INPUT_FILE'" +fi +input_base=`basename $input_base` + +if [ "$OUTPUT_DIR" = "--" ] ; then + fatal "Bad output dir '$OUTPUT_DIR'" +fi + +if [ -n "$output" ] ; then + if [ "$output" = "--" ] ; then + OUTPUT_DIR="--" + else + csv_output=$output + fi +else + csv_output=$RUN_DIR/${input_base}.csv +fi + +APP_CMD=$* + +echo "RPL: profiling '$APP_CMD'" +echo "RPL: input file '$INPUT_FILE'" + +input_list="" +RES_DIR="" +if [ "$input_type" = "xml" ] ; then + input_list=$INPUT_FILE +elif [ "$input_type" = "txt" ] ; then + OUTPUT_DIR="-" + RES_DIR=$DATA_PATH/$DATA_DIR + if [ -e $RES_DIR ] ; then + error "Rundir '$RES_DIR' exists" + fi + mkdir -p $RES_DIR + echo "RPL: output dir '$RES_DIR'" + $BIN_DIR/txt2xml.sh $INPUT_FILE $RES_DIR + input_list=`/bin/ls $RES_DIR/input*.xml` + export ROCPROFILER_SESS=$RES_DIR +else + fatal "Bad input file type '$INPUT_FILE'" +fi + +if [ -n "$csv_output" ] ; then + rm -f $csv_output +fi + +for name in $input_list; do + run $name $OUTPUT_DIR $APP_CMD + if [ -n "$ROCPROFILER_SESS" -a -e "$ROCPROFILER_SESS/error" ] ; then + echo "Error found, profiling aborted." + csv_output="" + break + fi +done + +if [ -n "$csv_output" ] ; then + python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST + if [ "$?" 
-eq 0 ] ; then + echo "RPL: '$csv_output' is generated" + fi +fi + +if [ "$DATA_PATH" = "$TMP_DIR" ] ; then + if [ -e "$RES_DIR" ] ; then + rm -rf $RES_DIR + fi +fi + +exit 0 diff --git a/bin/tblextr.py b/bin/tblextr.py new file mode 100755 index 00000000..630417ce --- /dev/null +++ b/bin/tblextr.py @@ -0,0 +1,121 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
################################################################################

#!/usr/bin/python
import os
import re
import sys

# Parsing results in the format:
#dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503):
#  GRBM_GUI_ACTIVE (74332)
#  SQ_WAVES (4096)
#  SQ_INSTS_VMEM_RD (36864)

# global vars
# CSV column names in output order; counter names are appended as they are
# first seen by parse_res().
var_list = ['Index', 'KernelName', 'DispatchNs', 'BeginNs', 'EndNs', 'CompleteNs']
# Dispatch index (string, as captured from the file) -> {column name: value string}.
var_table = {}

# Record patterns, compiled once at import instead of once per parsed file.
# Raw strings keep the regex escapes (\[ \d \( ...) out of Python's own
# string-escape processing.
_BEG_PATTERN = re.compile(r'^dispatch\[(\d*)\], queue_index\(\d*\), kernel_name\("([^"]*)"\)')
_TS_PATTERN = re.compile(r', time\((\d*),(\d*),(\d*),(\d*)\)')
_VAR_PATTERN = re.compile(r'^\s*([^\s]*)\s+\((\d*)\)')
# Columns filled from the four time(...) fields, in capture-group order.
_TS_COLUMNS = ('DispatchNs', 'BeginNs', 'EndNs', 'CompleteNs')
#############################################################

def fatal(msg):
    """Write *msg*, prefixed with the script name, to stderr and exit with status 1."""
    sys.stderr.write(sys.argv[0] + ": " + msg + "\n")
    sys.exit(1)
#############################################################

# parse results method
def parse_res(infile):
    """Parse one profiler results file into the global var_table / var_list.

    A line matching the dispatch header pattern selects the current dispatch
    and, the first time that index is seen, creates its var_table entry with
    'Index', the quoted 'KernelName' and (when a time(...) group is present)
    the four timestamp columns. A following "NAME (value)" line stores the
    counter under the current dispatch and appends NAME to var_list if new.

    Exits through fatal() when *infile* does not exist or when a counter line
    appears before any dispatch header in the file.
    """
    if not os.path.isfile(infile):
        fatal("Error: input file '" + infile + "' not found")

    # None until the first dispatch header of this file has been seen.
    dispatch_number = None
    # 'with' closes the file even when fatal() aborts parsing half way.
    with open(infile, 'r') as inp:
        for line in inp:
            # Strip only the line terminator: slicing off the last character
            # would truncate a final line that has no trailing newline and
            # silently lose its counter.
            record = line.rstrip('\n')

            m = _VAR_PATTERN.match(record)
            if m:
                var, val = m.group(1), m.group(2)
                if dispatch_number is None:
                    fatal("Error: counter '" + var + "' found before any dispatch record in '" + infile + "'")
                var_table[dispatch_number][var] = val
                if var not in var_list:
                    var_list.append(var)

            m = _BEG_PATTERN.match(record)
            if m:
                dispatch_number = m.group(1)
                if dispatch_number not in var_table:
                    var_table[dispatch_number] = {
                        'Index': dispatch_number,
                        'KernelName': "\"" + m.group(2) + "\""
                    }
                    # NOTE(review): indentation was lost in this copy of the
                    # script; timestamps are taken from the first occurrence of
                    # a dispatch header only - confirm against the upstream file.
                    m = _TS_PATTERN.search(record)
                    if m:
                        for column, value in zip(_TS_COLUMNS, m.groups()):
                            var_table[dispatch_number][column] = value
#############################################################

# print results table method
def print_tbl(outfile):
    """Write the collected dispatch table to *outfile* as CSV.

    Rows are ordered by numeric dispatch index. The columns are the entries of
    the global var_list that are present in the lowest-index dispatch; var_list
    is narrowed to that set as a side effect. Every field, including the last
    one on a line, is followed by a comma. A dispatch that lacks one of the
    selected columns gets an empty cell.

    Returns 1 without touching *outfile* when var_table is empty, 0 otherwise.
    Exits through fatal() if an entry's 'Index' disagrees with its table key.
    """
    global var_list
    if not var_table:
        return 1

    # sorted() accepts the dict directly and works on Python 2 and 3;
    # dict.keys() is a view without .sort() on Python 3.
    keys = sorted(var_table, key=int)

    first_entry = var_table[keys[0]]
    var_list = [var for var in var_list if var in first_entry]

    # 'with' closes the file even when fatal() exits in the middle of the table.
    with open(outfile, 'w') as out:
        out.write(''.join(var + ',' for var in var_list) + "\n")

        for ind in keys:
            entry = var_table[ind]
            dispatch_number = entry['Index']
            if ind != dispatch_number:
                fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n")
            out.write(''.join(entry.get(var, '') + ',' for var in var_list) + "\n")

    return 0
#############################################################

# main
# Guarded so that importing the module does not parse argv or call sys.exit().
if __name__ == "__main__":
    # NOTE(review): the argument placeholders were stripped from this copy of
    # the script; they are restored from how rpl_run.sh invokes it
    # (tblextr.py <csv output> <results files...>) - confirm the exact wording.
    if len(sys.argv) < 3:
        fatal("Usage: " + sys.argv[0] + " <output CSV file> <input result file> [<input result file> ...]")

    outfile = sys.argv[1]
    infiles = sys.argv[2:]
    for f in infiles:
        parse_res(f)
    ret = print_tbl(outfile)
    sys.exit(ret)
#############################################################

# The text below is not part of tblextr.py. It is the opening of the next file
# in the patch series (bin/txt2xml.sh), kept verbatim because it shares this
# source line:
# diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh
# new file mode 100755
# index 00000000..9881160d
# --- /dev/null
# +++ b/bin/txt2xml.sh
# @@ -0,0 +1,94 @@
# +################################################################################
# +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
# +#
# +# Permission is hereby granted, free of charge, to any person obtaining a copy
# +# of this software and associated documentation files (the "Software"), to deal
# +# in the Software without restriction, including without limitation the rights
# +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# +# copies of the Software, and to permit persons to whom the Software is
# +# furnished to do so, subject to the following conditions:
# +#
# +# The above copyright notice and this permission notice shall be included in
# +# all copies or substantial portions of the Software.
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +#!/bin/bash +timestamp=`date +%y%m%d_%H%M%S` + +if [ $# = 0 ] ; then + echo "Usage: $0 [output dir]" + exit -1 +fi + +input=$1 +outdir=$2 +if [ -z "$outdir" ] ; then + outdir="." +fi + +range="" +kernel="" +gpu_index="" + +parse() { + scan="$1" + index=0 + while read -r line ; do + line=`echo $line | sed "s/\s*#.*$//"` + if [ -z "$line" ] ; then + continue + fi + + feature=`echo $line | sed -n "s/^\s*\([a-z]*\)\s*:.*$/\1/p"` + line=`echo $line | sed "s/^[^:]*:\s*//"` + line=`echo "$line" | sed -e "s/\s*=\s*/=/g" -e "s/\s*:\s*/:/g" -e "s/,\{1,\}/ /g" -e "s/\s\{1,\}/ /g" -e "s/\s*$//"` + + if [ "$scan" = 0 ] ; then + line=`echo "$line" | sed -e "s/ /,/g"` + if [ "$feature" == "range" ] ; then + range=$line + fi + if [ "$feature" == "kernel" ] ; then + kernel=$line + fi + if [ "$feature" == "gpu" ] ; then + gpu_index=$line + fi + else + output=$outdir/input${index}.xml + header="# $timestamp '$output' generated with '$0 $*'" + + if [ "$feature" == "pmc" ] ; then + line=`echo "$line" | sed -e "s/ /,/g"` + cat >> $output < + +EOF + fi + + if [ "$feature" == "sqtt" ] ; then + cat >> $output < + +EOF + fi + fi + + index=$((index + 1)) + done < $input +} + +parse 0 +parse 1 + +exit 0 diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index ca7c4804..a71acb66 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -1,16 +1,16 @@ 
################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -48,6 +48,8 @@ set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-result" ) +#set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=int-in-bool-context" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) @@ -64,6 +66,11 @@ if ( DEFINED ENV{CMAKE_DEBUG_TRACE} ) add_definitions ( -DDEBUG_TRACE=1 ) endif() +## Enable AQL-profile new API +if ( NOT DEFINED ENV{CMAKE_CURR_API} ) + add_definitions ( -DAQLPROF_NEW_API=1 ) +endif() + ## Enable direct loading of AQL-profile HSA extension if ( DEFINED ENV{CMAKE_LD_AQLPROFILE} ) add_definitions ( -DROCP_LD_AQLPROFILE=1 ) diff --git a/cmake_modules/utils.cmake b/cmake_modules/utils.cmake index 15865820..f95a7833 100644 --- 
a/cmake_modules/utils.cmake +++ b/cmake_modules/utils.cmake @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index e7a5a1e0..17106687 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -45,8 +45,8 @@ THE SOFTWARE. 
#include #include -#define ROCPROFILER_VERSION_MAJOR 1 -#define ROCPROFILER_VERSION_MINOR 1 +#define ROCPROFILER_VERSION_MAJOR 3 +#define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus extern "C" { @@ -178,7 +178,7 @@ hsa_status_t rocprofiler_open(hsa_agent_t agent, // GPU han uint32_t mode, // profiling mode mask rocprofiler_properties_t* properties); // profiling properties -// Add feature to e features set +// Add feature to a features set hsa_status_t rocprofiler_add_feature(const rocprofiler_feature_t* feature, // [in] rocprofiler_feature_set_t* features_set); // [in/out] profiling features set @@ -204,10 +204,10 @@ hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling contex // Dispatch record typedef struct { - uint64_t dispatch; // dispatch timestamp - uint64_t begin; // begin timestamp - uint64_t end; // end timestamp - uint64_t complete; // completion signal timestamp + uint64_t dispatch; // dispatch timestamp, ns + uint64_t begin; // kernel begin timestamp, ns + uint64_t end; // kernel end timestamp, ns + uint64_t complete; // completion signal timestamp, ns } rocprofiler_dispatch_record_t; // Profiling callback data @@ -326,8 +326,11 @@ typedef struct { union { struct { const char* name; // metric name + uint32_t instances; // instances number const char* expr; // metric expression, NULL for basic counters const char* description; // metric description + const char* block_name; // block name + uint32_t block_counters; // number of block counters } metric; struct { const char* name; // trace name @@ -357,6 +360,13 @@ hsa_status_t rocprofiler_query_info( hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback void *data); // [in/out] data passed to callback +// Creates a profiled queue. 
All dispatches on this queue will be profiled +hsa_status_t rocprofiler_queue_create_profiled( + hsa_agent_t agent_handle,uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue); + #ifdef __cplusplus } // extern "C" block #endif // __cplusplus diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 45bc2719..9a398411 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE diff --git a/src/core/context.h b/src/core/context.h index 966acaef..f7ad792d 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -27,10 +27,14 @@ THE SOFTWARE. 
#include #include +#include // usleep +#include +#include #include #include #include +#include "core/group_set.h" #include "core/metrics.h" #include "core/profile.h" #include "core/queue.h" @@ -47,26 +51,6 @@ inline unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); } -// Block descriptor -struct block_des_t { - uint32_t id; - uint32_t index; -}; - -// block_des_t less-then functor -struct lt_block_des { - bool operator()(const block_des_t& a1, const block_des_t& a2) const { - return (a1.id < a2.id) || ((a1.id == a2.id) && (a1.index < a2.index)); - } -}; - -// Block status -struct block_status_t { - uint32_t max_counters; - uint32_t counter_index; - uint32_t group_index; -}; - // Metrics arguments template class MetricArgs : public xml::args_cache_t { public: @@ -94,6 +78,9 @@ template class MetricArgs : public xml::args_cache_t { // Profiling group class Group { public: + typedef uint32_t refs_t; + typedef std::atomic atomic_refs_t; + Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) : pmc_profile_(agent_info), sqtt_profile_(agent_info), @@ -143,10 +130,10 @@ class Group { Context* GetContext() { return context_; } uint32_t GetIndex() const { return index_; } - void ResetRefs() { refs_ = n_profiles_; } - uint32_t DecrRefs() { - return (refs_ > 0) ? 
--refs_ : 0; - } + atomic_refs_t* AtomicRefsCount() { return reinterpret_cast(&refs_); } + void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); } + void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); } + uint32_t FetchDecrRefsCount() { return AtomicRefsCount()->fetch_sub(1, std::memory_order_acq_rel); } private: PmcProfile pmc_profile_; @@ -156,7 +143,7 @@ class Group { pkt_vector_t stop_vector_; pkt_vector_t read_vector_; uint32_t n_profiles_; - uint32_t refs_; + refs_t refs_; Context* const context_; const uint32_t index_; }; @@ -164,7 +151,6 @@ class Group { // Profiling context class Context { public: - typedef std::mutex mutex_t; typedef std::map info_map_t; Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, @@ -177,14 +163,21 @@ class Context { handler_(handler), handler_arg_(handler_arg) { + if (info_count == 0) return; + metrics_ = MetricsDict::Create(agent_info); if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); - Initialize(info, info_count); + if (Initialize(info, info_count) == false) { + fprintf(stdout, "\nInput metrics out of HW limit. 
Proposed metrics group set:\n"); fflush(stdout); + MetricsGroupSet(agent_info, info, info_count).Print(stdout); + fprintf(stdout, "\n"); fflush(stdout); + EXC_RAISING(HSA_STATUS_ERROR, "Metrics list exceeds HW limits"); + } Finalize(); if (handler != NULL) { for (unsigned group_index = 0; group_index < set_.size(); ++group_index) { - set_[group_index].ResetRefs(); + set_[group_index].ResetRefsCount(); const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Handler for stop packet completion @@ -207,7 +200,7 @@ class Context { } // Initialize rocprofiler context - void Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { + bool Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { // Register input features to not duplicate by features referencing for (unsigned i = 0; i < info_count; ++i) { rocprofiler_feature_t* info = &info_array[i]; @@ -270,9 +263,12 @@ class Context { block_status.max_counters = block_counters; } if (block_status.counter_index >= block_status.max_counters) { + return false; + block_status.counter_index = 0; block_status.group_index += 1; } + block_status.counter_index += 1; if (block_status.group_index >= set_.size()) { set_.push_back(Group(agent_info_, this, block_status.group_index)); } @@ -285,6 +281,8 @@ class Context { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } } + + return true; } void Finalize() { @@ -294,11 +292,11 @@ class Context { } } - void Reset(const uint32_t& group_index) { set_[group_index].ResetRefs(); } + void Reset(const uint32_t& group_index) { set_[group_index].ResetRefsCount(); } uint32_t GetGroupCount() const { return set_.size(); } - rocprofiler_group_t GetGroupInfo(Group* g) { + inline rocprofiler_group_t GetGroupInfo(Group* g) { rocprofiler::info_vector_t& info_vector = g->GetInfoVector(); rocprofiler_group_t group = {}; group.index = g->GetIndex(); @@ -307,8 +305,14 @@ class Context { 
group.feature_count = info_vector.size(); return group; } - rocprofiler_group_t GetGroupInfo(const uint32_t& index) { - return GetGroupInfo(&set_[index]); + inline rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + rocprofiler_group_t group = {}; + if (set_.empty()) { + group.context = reinterpret_cast(this); + } else { + group = GetGroupInfo(&set_[index]); + } + return group; } const pkt_vector_t& StartPackets(const uint32_t& group_index) const { @@ -355,14 +359,7 @@ class Context { const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Wait for stop packet to complete - const uint64_t timeout = timeout_; - bool complete = false; - while (!complete) { - const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout, - HSA_WAIT_STATE_BLOCKED); - complete = (signal_value < 1); - if (!complete) WARN_LOGGING("timeout"); - } + hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1); for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL}; const hsa_status_t status = @@ -398,8 +395,19 @@ class Context { } } - static void SetTimeout(uint64_t timeout) { timeout_ = timeout; } - static uint64_t GetTimeout() { return timeout_; } + static bool Handler(hsa_signal_value_t value, void* arg) { + Group* group = reinterpret_cast(arg); + Context* context = group->GetContext(); + auto r = group->FetchDecrRefsCount(); + if (r == 1) { + const rocprofiler_group_t group_info = context->GetGroupInfo(group); + context->handler_(group_info, context->handler_arg_); + } + return false; + } + + Group* GetGroup(const uint32_t& index) { return &set_[index]; } + rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } private: // Getting profling packets @@ -412,18 +420,6 @@ class Context { 
return vec; } - static bool Handler(hsa_signal_value_t value, void* arg) { - Group* group = reinterpret_cast(arg); - Context* context = group->GetContext(); - context->mutex_.lock(); - uint32_t r = group->DecrRefs(); - context->mutex_.unlock(); - if (r == 0) { - return context->handler_(context->GetGroupInfo(group), context->handler_arg_); - } - return false; - } - static hsa_status_t DataCallback(hsa_ven_amd_aqlprofile_info_type_t ainfo_type, hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; @@ -513,9 +509,6 @@ class Context { return info; } - // Profiling data waiting timeout - static uint64_t timeout_; - // GPU handel const hsa_agent_t agent_; const util::AgentInfo* agent_info_; @@ -538,7 +531,6 @@ class Context { // Context completion handler rocprofiler_handler_t handler_; void* handler_arg_; - mutex_t mutex_; }; } // namespace rocprofiler diff --git a/src/core/group_set.h b/src/core/group_set.h new file mode 100644 index 00000000..b255079b --- /dev/null +++ b/src/core/group_set.h @@ -0,0 +1,244 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_CORE_GROUP_SET_H_ +#define SRC_CORE_GROUP_SET_H_ + +#include +#include +#include + +#include "core/metrics.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" + +namespace rocprofiler { + +// Block descriptor +struct block_des_t { + uint32_t id; + uint32_t index; +}; + +// block_des_t less-then functor +struct lt_block_des { + bool operator()(const block_des_t& a1, const block_des_t& a2) const { + return (a1.id < a2.id) || ((a1.id == a2.id) && (a1.index < a2.index)); + } +}; + +// Block status +struct block_status_t { + uint32_t max_counters; + uint32_t counter_index; + uint32_t group_index; +}; + +// Metrics set class +class MetricsGroup { + public: + // Info map type + typedef std::map info_map_t; + // Blocks map type + typedef std::map blocks_map_t; + + MetricsGroup(const util::AgentInfo* agent_info) : + agent_info_(agent_info) + { + metrics_ = MetricsDict::Create(agent_info); + if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + } + + void Print(FILE* file) const { + for (const Metric* metric : metrics_vec_) { + fprintf(file, " %s", metric->GetName().c_str()); fflush(stdout); + } + fprintf(file, "\n"); fflush(stdout); + } + + static const Metric* GetMetric(const MetricsDict* metrics, const std::string& name) { + // Metric object + const Metric* metric = metrics->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' 
is not found"); + return metric; + } + + static const Metric* GetMetric(const MetricsDict* metrics, const rocprofiler_feature_t* info) { + // Metrics name + const char* name = info->name; + if (name == NULL) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); + const Metric* metric = GetMetric(metrics, name); +#if 0 + std::cout << " " << name << (metric->GetExpr() ? " = " + metric->GetExpr()->String() : " counter") << std::endl; +#endif + return metric; + } + + // Add metric + bool AddMetric(const rocprofiler_feature_t* info) { + return AddMetric(GetMetric(metrics_, info)); + } + + bool AddMetric(const Metric* metric) { + // Blocks utilization delta + blocks_map_t blocks_delta; + + // Process metrics counters + const counters_vec_t& counters_vec = metric->GetCounters(); + if (counters_vec.empty()) + EXC_RAISING(HSA_STATUS_ERROR, "bad metric '" << metric->GetName() << "' is empty"); + + for (const counter_t* counter : counters_vec) { + const event_t* event = &(counter->event); + + // For metrics expressions checking that there is no the same counter in the input metrics + // and also that the counter wasn't registered already by another input metric expression + if (info_map_.find(counter->name) != info_map_.end()) continue; + + const block_des_t block_des = {event->block_name, event->block_index}; + auto ret = blocks_map_.insert({block_des, {}}); + block_status_t& block_status = ret.first->second; + if (ret.second == true) { + profile_t query = {}; + query.agent = agent_info_->dev_id; + query.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + query.events = event; + + uint32_t block_counters; + hsa_status_t status = util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &query, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "get block_counters info"); + block_status.max_counters = block_counters; + } + + ret = blocks_delta.insert({block_des, 
block_status}); + block_status_t& delta_status = ret.first->second; + delta_status.counter_index += 1; + if (delta_status.counter_index > delta_status.max_counters) return false; + } + + // Register metric + metrics_vec_.push_back(metric); + info_map_[metric->GetName()] = metric; + for (const counter_t* counter : counters_vec) { + if (info_map_.find(counter->name) == info_map_.end()) info_map_[counter->name] = NewCounterInfo(counter->name); + } + for (const auto& entry : blocks_delta) { + blocks_map_[entry.first] = entry.second; + } + + return true; + } + + private: + const Metric* NewCounterInfo(const std::string& name) const { + return GetMetric(metrics_, name); + } + + // Agent info + const util::AgentInfo* const agent_info_; + // Metrics dictionary + const MetricsDict* metrics_; + // Info map + info_map_t info_map_; + // Blocks map + blocks_map_t blocks_map_; + // Metrics vector + std::vector metrics_vec_; +}; + +// Metrics groups class +class MetricsGroupSet { + public: + MetricsGroupSet(const util::AgentInfo* agent_info, const rocprofiler_feature_t* info_array, const uint32_t info_count) : + agent_info_(agent_info) + { + metrics_ = MetricsDict::Create(agent_info); + if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + Initialize(info_array, info_count); + } + + ~MetricsGroupSet() { + for (auto* group : groups_) delete group; + } + + uint32_t GetSize() const { return groups_.size(); } + + void Print(FILE* file) const { + uint32_t idx = 0; + for (const auto* group : groups_) { + ++idx; + fprintf(stdout, " group%u:", idx); fflush(stdout); + group->Print(file); + } + } + + private: + void Initialize(const rocprofiler_feature_t* info_array, const uint32_t info_count) { + std::multimap > input_metrics; + for (unsigned i = 0; i < info_count; ++i) { + const rocprofiler_feature_t* info = &info_array[i]; + if (info->kind != ROCPROFILER_FEATURE_KIND_METRIC) continue; + const Metric* metric = MetricsGroup::GetMetric(metrics_, info); + const 
uint32_t counters_num = metric->GetCounters().size(); + input_metrics.insert({counters_num, metric}); + + if (MetricsGroup(agent_info_).AddMetric(metric) == false) { + AQL_EXC_RAISING(HSA_STATUS_ERROR, "Metric '" << metric->GetName() << "' doesn't fit in one group"); + } + } +#if 0 + for (const auto& entry : input_metrics) { + printf("%u %s\n", entry.first, entry.second->GetName().c_str()); + } +#endif + auto end = input_metrics.end(); + while (!input_metrics.empty()) { + MetricsGroup* group = NextGroup(); + auto it = input_metrics.begin(); + do { + auto curr = it++; + const Metric* metric = curr->second; + if (group->AddMetric(metric) == true) { + input_metrics.erase(curr); + } + } while (it != end); + } + } + + MetricsGroup* NextGroup() { + groups_.push_back(new MetricsGroup(agent_info_)); + return groups_.back(); + } + + // Agent info + const util::AgentInfo* const agent_info_; + // Metrics dictionary + const MetricsDict* metrics_; + // Metrics group vector + std::vector groups_; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_GROUP_SET_H_ diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp index a2a289aa..7703c662 100644 --- a/src/core/intercept_queue.cpp +++ b/src/core/intercept_queue.cpp @@ -34,7 +34,8 @@ InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL; void* InterceptQueue::callback_data_ = NULL; InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; const char* InterceptQueue::kernel_none_ = ""; -uint64_t InterceptQueue::timeout_ = UINT64_MAX; Tracker* InterceptQueue::tracker_ = NULL; bool InterceptQueue::tracker_on_ = false; +bool InterceptQueue::in_constr_call_ = false; + } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index c5376bb9..627a718a 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -48,51 +48,68 @@ class InterceptQueue { typedef std::recursive_mutex mutex_t; typedef std::map obj_map_t; typedef hsa_status_t 
(*queue_callback_t)(hsa_queue_t*, void* data); + typedef void (*queue_event_callback_t)(hsa_status_t status, hsa_queue_t *queue, void *arg); static void HsaIntercept(HsaApiTable* table); - static hsa_status_t QueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + static hsa_status_t InterceptQueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data, uint32_t private_segment_size, - uint32_t group_segment_size, hsa_queue_t** queue) { - hsa_status_t status = HSA_STATUS_ERROR; + uint32_t group_segment_size, hsa_queue_t** queue, + const bool& tracker_on) { std::lock_guard lck(mutex_); + hsa_status_t status = HSA_STATUS_ERROR; + + if (in_constr_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()"); + in_constr_call_ = true; - ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, callback, data, private_segment_size, + ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, queue_event_callback, data, private_segment_size, group_segment_size, queue, &status); - if (status != HSA_STATUS_SUCCESS) abort(); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "ProxyQueue::Create()"); - if (tracker_on_ && (tracker_ == NULL)) { - tracker_ = new Tracker(timeout_); + if (tracker_on || tracker_on_) { + if (tracker_ == NULL) tracker_ = new Tracker; status = hsa_amd_profiling_set_profiler_enabled(*queue, true); - if (status != HSA_STATUS_SUCCESS) abort(); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_set_profiler_enabled()"); } if (!obj_map_) obj_map_ = new obj_map_t; InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy); (*obj_map_)[(uint64_t)(*queue)] = obj; status = proxy->SetInterceptCB(OnSubmitCB, obj); + obj->queue_event_callback_ = callback; + in_constr_call_ = false; return status; } + static hsa_status_t QueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t 
status, hsa_queue_t* source, + void* data), + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t** queue) { + return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size, group_segment_size, queue, false); + } + + static hsa_status_t QueueCreateTracked(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, + void* data), + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t** queue) { + return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size, group_segment_size, queue, true); + } + static hsa_status_t QueueDestroy(hsa_queue_t* queue) { std::lock_guard lck(mutex_); hsa_status_t status = HSA_STATUS_ERROR; - if (destroy_callback_ != NULL) { - status = destroy_callback_(queue, callback_data_); - if (status != HSA_STATUS_SUCCESS) return status; - } + if (destroy_callback_ != NULL) { + status = destroy_callback_(queue, callback_data_); + } - obj_map_t::iterator it = obj_map_->find((uint64_t)queue); - if (it != obj_map_->end()) { - const InterceptQueue* obj = it->second; - assert(queue == obj->queue_); - delete obj; - obj_map_->erase(it); - status = HSA_STATUS_SUCCESS; + if (status == HSA_STATUS_SUCCESS) { + status = DelObj(queue); } return status; @@ -104,47 +121,73 @@ class InterceptQueue { InterceptQueue* obj = reinterpret_cast(data); Queue* proxy = obj->proxy_; + // Travers input packets for (uint64_t j = 0; j < count; ++j) { - bool to_submit = true; const packet_t* packet = &packets_arr[j]; + bool to_submit = true; + // Checking for dispatch packet type if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && (dispatch_callback_ != NULL)) { - rocprofiler_group_t group = {}; const hsa_kernel_dispatch_packet_t* dispatch_packet = reinterpret_cast(packet); - const char* kernel_name = GetKernelName(dispatch_packet); - const rocprofiler_dispatch_record_t* record = NULL; + + // 
Adding kernel timing tracker + Tracker::entry_t* tracker_entry = NULL; if (tracker_ != NULL) { - const auto* entry = tracker_->Add(obj->agent_info_->dev_id, dispatch_packet->completion_signal); - const_cast(dispatch_packet)->completion_signal = entry->signal; - record = entry->record; + tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal); + const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; } + + // Prepareing dispatch callback data + const char* kernel_name = GetKernelName(dispatch_packet); rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, obj->queue_, user_que_idx, dispatch_packet, kernel_name, - record}; + (tracker_entry) ? tracker_entry->record : NULL}; + + // Calling dispatch callback + rocprofiler_group_t group = {}; hsa_status_t status = dispatch_callback_(&data, callback_data_, &group); free(const_cast(kernel_name)); - if ((status == HSA_STATUS_SUCCESS) && (group.context != NULL)) { + // Injecting profiling start/stop packets + if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { + if (tracker_entry != NULL) tracker_->Delete(tracker_entry); + } else { Context* context = reinterpret_cast(group.context); - const pkt_vector_t& start_vector = context->StartPackets(group.index); - const pkt_vector_t& stop_vector = context->StopPackets(group.index); - - pkt_vector_t packets = start_vector; - packets.insert(packets.end(), *packet); - packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); - if (writer != NULL) { - writer(&packets[0], packets.size()); + + if (group.feature_count != 0) { + if (tracker_entry != NULL) { + Group* context_group = context->GetGroup(group.index); + context_group->IncrRefsCount(); + tracker_->Enable(tracker_entry, Context::Handler, reinterpret_cast(context_group)); + } + + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& stop_vector = 
context->StopPackets(group.index); + pkt_vector_t packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; } else { - proxy->Submit(&packets[0], packets.size()); + if (tracker_entry != NULL) { + void* context_handler_arg = NULL; + rocprofiler_handler_t context_handler_fun = context->GetHandler(&context_handler_arg); + tracker_->Enable(tracker_entry, context_handler_fun, context_handler_arg); + rocprofiler_close(context); + } } - to_submit = false; } } + // Submitting the original packets if profiling was not enabled if (to_submit) { if (writer != NULL) { writer(packet, 1); @@ -152,8 +195,6 @@ class InterceptQueue { proxy->Submit(packet, 1); } } - - packet += 1; } } @@ -164,22 +205,19 @@ class InterceptQueue { destroy_callback_ = destroy_callback; } - static void SetTimeout(uint64_t timeout) { timeout_ = timeout; } static void TrackerOn(bool on) { tracker_on_ = on; } static bool IsTrackerOn() { return tracker_on_; } private: - InterceptQueue(const hsa_agent_t& agent, hsa_queue_t* const queue, ProxyQueue* proxy) : - queue_(queue), - proxy_(proxy) - { - agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) { + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported"); + InterceptQueue* obj = GetObj(queue); + if (obj->queue_event_callback_) obj->queue_event_callback_(status, obj->queue_, arg); } - ~InterceptQueue() { ProxyQueue::Destroy(proxy_); } - static packet_word_t GetHeaderType(const packet_t* packet) { + static hsa_packet_type_t GetHeaderType(const packet_t* packet) { const packet_word_t* header = reinterpret_cast(packet); - return (*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask; + return 
static_cast((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask); } static const char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) { @@ -209,6 +247,45 @@ class InterceptQueue { return funcname; } + // method to get an intercept queue object + static InterceptQueue* GetObj(const hsa_queue_t* queue) { + std::lock_guard lck(mutex_); + InterceptQueue* obj = NULL; + obj_map_t::const_iterator it = obj_map_->find((uint64_t)queue); + if (it != obj_map_->end()) { + obj = it->second; + assert(queue == obj->queue_); + } + return obj; + } + + // method to delete an intercept queue object + static hsa_status_t DelObj(const hsa_queue_t* queue) { + std::lock_guard lck(mutex_); + hsa_status_t status = HSA_STATUS_ERROR; + obj_map_t::const_iterator it = obj_map_->find((uint64_t)queue); + if (it != obj_map_->end()) { + const InterceptQueue* obj = it->second; + assert(queue == obj->queue_); + delete obj; + obj_map_->erase(it); + status = HSA_STATUS_SUCCESS;; + } + return status; + } + + InterceptQueue(const hsa_agent_t& agent, hsa_queue_t* const queue, ProxyQueue* proxy) : + queue_(queue), + proxy_(proxy) + { + agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + queue_event_callback_ = NULL; + } + + ~InterceptQueue() { + ProxyQueue::Destroy(proxy_); + } + static mutex_t mutex_; static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; static rocprofiler_callback_t dispatch_callback_; @@ -216,13 +293,14 @@ class InterceptQueue { static void* callback_data_; static obj_map_t* obj_map_; static const char* kernel_none_; - static uint64_t timeout_; static Tracker* tracker_; static bool tracker_on_; + static bool in_constr_call_; hsa_queue_t* const queue_; ProxyQueue* const proxy_; const util::AgentInfo* agent_info_; + queue_event_callback_t queue_event_callback_; }; } // namespace rocprofiler diff --git a/src/core/profile.h b/src/core/profile.h index 43d30a21..6d91192b 100644 --- a/src/core/profile.h +++ 
b/src/core/profile.h @@ -140,10 +140,14 @@ class Profile { if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start"); status = api->hsa_ven_amd_aqlprofile_stop(&profile_, &stop); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_stop"); +#ifdef AQLPROF_NEW_API hsa_status_t rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); #if 0 // Read API returns error if disabled if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); #endif +#else + hsa_status_t rd_status = HSA_STATUS_ERROR; +#endif // Set completion signal hsa_signal_t dummy_signal{}; diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index a96fadba..e8901387 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -152,7 +152,7 @@ bool LoadTool() { settings.intercept_mode = (intercept_mode) ? 1 : 0; settings.sqtt_size = SqttProfile::GetSize(); settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; - settings.timeout = Context::GetTimeout(); + settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; if (handler) handler(); @@ -161,8 +161,7 @@ bool LoadTool() { intercept_mode = (settings.intercept_mode != 0); SqttProfile::SetSize(settings.sqtt_size); SqttProfile::SetLocal(settings.sqtt_local != 0); - Context::SetTimeout(settings.timeout); - InterceptQueue::SetTimeout(settings.timeout); + util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); } @@ -188,8 +187,8 @@ CONSTRUCTOR_API void constructor() { } DESTRUCTOR_API void destructor() { - util::HsaRsrcFactory::Destroy(); rocprofiler::MetricsDict::Destroy(); + util::HsaRsrcFactory::Destroy(); util::Logger::Destroy(); } @@ -211,10 +210,8 @@ hsa_status_t GetExcStatus(const std::exception& e) { } rocprofiler_properties_t rocprofiler_properties; -uint64_t Context::timeout_ = UINT64_MAX; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; -Tracker::mutex_t Tracker::mutex_; util::Logger::mutex_t util::Logger::mutex_; util::Logger* util::Logger::instance_ = NULL; } @@ -230,11 +227,36 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa rocprofiler::SaveHsaApi(table); rocprofiler::ProxyQueue::InitFactory(); bool intercept_mode = false; + + // Checking environment to enable intercept mode const char* intercept_env = getenv("ROCP_HSA_INTERCEPT"); if (intercept_env != NULL) { - if (strncmp(intercept_env, "1", 1) == 0) intercept_mode = true; + switch (atoi(intercept_env)) { + // Intercepting disabled + case 0: + intercept_mode = false; + rocprofiler::InterceptQueue::TrackerOn(false); + break; + // Intercepting enabled without timestamping + case 1: + intercept_mode = true; + rocprofiler::InterceptQueue::TrackerOn(false); + break; + // Intercepting enabled with timestamping + case 2: + intercept_mode = true; + rocprofiler::InterceptQueue::TrackerOn(true); + break; + default: + ERR_LOGGING("Bad ROCP_HSA_INTERCEPT env var value (" << intercept_env << ")"); + return false; + } } - 
if (rocprofiler::LoadTool()) intercept_mode = true; + + // Loading a tool lib and setting of intercept mode + const bool intercept_mode_on = rocprofiler::LoadTool(); + if (intercept_mode_on) intercept_mode = true; + // HSA intercepting if (intercept_mode) { rocprofiler::ProxyQueue::HsaIntercept(table); @@ -479,6 +501,43 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_info( info.metric.name = strdup(name.c_str()); info.metric.description = strdup(descr.c_str()); info.metric.expr = expr.empty() ? NULL : strdup(expr.c_str()); + + if (expr.empty()) { + // Getting the block name + const std::string block_name = node->opts["block"]; + + // Querying profile + rocprofiler::profile_t profile = {}; + profile.agent = agent_info->dev_id; + profile.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + + // Query block id info + hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0}; + hsa_status_t status = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(HSA_STATUS_ERROR, "get block id info: '" << block_name << "'"); + + // Metric object + const std::string metric_name = (query.instance_count > 1) ? 
name + "[0]" : name; + const rocprofiler::Metric* metric = dict->Get(metric_name); + if (metric == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metric '" << name << "' is not found"); + + // Process metrics counters + const rocprofiler::counters_vec_t& counters_vec = metric->GetCounters(); + if (counters_vec.size() != 1) EXC_RAISING(HSA_STATUS_ERROR, "error: '" << metric->GetName() << "' is not basic"); + + // Query block counters number + uint32_t block_counters; + profile.events = &(counters_vec[0]->event); + status = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + if (status != HSA_STATUS_SUCCESS) continue; + + info.metric.instances = query.instance_count; + info.metric.block_name = block_name.c_str(); + info.metric.block_counters = block_counters; + } + status = callback(info, data); if (status != HSA_STATUS_SUCCESS) break; } @@ -519,4 +578,14 @@ PUBLIC_API hsa_status_t rocprofiler_query_info( API_METHOD_SUFFIX } +// Creates a profiled queue. All dispatches on this queue will be profiled +PUBLIC_API hsa_status_t rocprofiler_queue_create_profiled( + hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue) +{ + return rocprofiler::InterceptQueue::QueueCreateTracked(agent, size, type, callback, data, private_segment_size, group_segment_size, queue); +} + } // extern "C" diff --git a/src/core/tracker.h b/src/core/tracker.h index eae0c112..acbf5cf6 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -28,9 +28,11 @@ THE SOFTWARE. 
#include #include +#include #include #include +#include "util/hsa_rsrc_factory.h" #include "inc/rocprofiler.h" #include "util/exception.h" #include "util/logger.h" @@ -39,12 +41,13 @@ namespace rocprofiler { class Tracker { public: - typedef uint64_t timestamp_t; - typedef long double freq_t; typedef std::mutex mutex_t; + typedef util::HsaRsrcFactory::timestamp_t timestamp_t; typedef rocprofiler_dispatch_record_t record_t; struct entry_t; typedef std::list sig_list_t; + typedef sig_list_t::iterator sig_list_it_t; + struct entry_t { Tracker* tracker; sig_list_t::iterator it; @@ -52,71 +55,59 @@ class Tracker { hsa_signal_t orig; hsa_signal_t signal; record_t* record; + std::atomic handler; + void* arg; + bool context_active; }; - Tracker(uint64_t timeout = UINT64_MAX) : timeout_(timeout), outstanding(0) { - timestamp_t timestamp_hz = 0; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ×tamp_hz); - if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)"); - timestamp_factor_ = (freq_t)1000000000 / (freq_t)timestamp_hz; - } + Tracker() : + outstanding_(0), + hsa_rsrc_(&(util::HsaRsrcFactory::Instance())) + {} + ~Tracker() { - mutex_.lock(); - for (entry_t* entry : sig_list_) { - assert(entry != NULL); - while (1) { - const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire( - entry->signal, - HSA_SIGNAL_CONDITION_LT, - 1, - timeout_, - HSA_WAIT_STATE_BLOCKED); - if (signal_value < 1) break; - else WARN_LOGGING("tracker timeout"); - } - Del(entry); + auto it = sig_list_.begin(); + auto end = sig_list_.end(); + while (it != end) { + auto cur = it++; + hsa_rsrc_->SignalWait((*cur)->signal); + Erase(cur); } - mutex_.unlock(); } // Add tracker entry - entry_t* Add(const hsa_agent_t& agent, const hsa_signal_t& orig) { + entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) { hsa_status_t status = HSA_STATUS_ERROR; + + // Creating a new tracker entry entry_t* entry = 
new entry_t{}; assert(entry); entry->tracker = this; - mutex_.lock(); - entry->it = sig_list_.insert(sig_list_.begin(), entry); - mutex_.unlock(); - entry->agent = agent; entry->orig = orig; - status = hsa_signal_create(1, 0, NULL, &(entry->signal)); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + // Creating a record with the dispatch timestamps record_t* record = new record_t{}; assert(record); + record->dispatch = hsa_rsrc_->TimestampNs(); entry->record = record; - status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &record->dispatch); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)"); - hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + // Creating a proxy signal + status = hsa_signal_create(1, 0, NULL, &(entry->signal)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); - if (trace_on_) { - mutex_.lock(); - entry->tracker->outstanding++; - fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); - fflush(stdout); - mutex_.unlock(); - } + // Adding antry to the list + mutex_.lock(); + entry->it = sig_list_.insert(sig_list_.begin(), entry); + mutex_.unlock(); return entry; } - private: // Delete tracker entry - void Del(entry_t* entry) { + void Delete(entry_t* entry) { hsa_signal_destroy(entry->signal); mutex_.lock(); sig_list_.erase(entry->it); @@ -124,31 +115,53 @@ class Tracker { delete entry; } - // Handler for packet completion - static bool Handler(hsa_signal_value_t value, void* arg) { - entry_t* entry = reinterpret_cast(arg); + // Enable tracker entry + void Enable(entry_t* entry, void* handler, void* arg) { + // Set entry handler and release the entry + 
entry->arg = arg; + entry->handler.store(handler, std::memory_order_release); + + // Debug trace + if (trace_on_) { + auto outstanding = outstanding_.fetch_add(1); + fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); + fflush(stdout); + } + } + + void Enable(entry_t* entry, hsa_amd_signal_handler handler, void* arg) { + entry->context_active = true; + Enable(entry, reinterpret_cast(handler), arg); + } + void Enable(entry_t* entry, rocprofiler_handler_t handler, void* arg) { + Enable(entry, reinterpret_cast(handler), arg); + } + + private: + // Delete an entry by iterator + void Erase(const sig_list_it_t& it) { Delete(*it); } + + // Entry completion + void Complete(entry_t* entry) { record_t* record = entry->record; + // Debug trace if (trace_on_) { - mutex_.lock(); - entry->tracker->outstanding--; - fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); + auto outstanding = outstanding_.fetch_sub(1); + fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); fflush(stdout); - mutex_.unlock(); } - timestamp_t complete_timestamp = 0; + // Query begin/end and complete timestamps hsa_amd_profiling_dispatch_time_t dispatch_time{}; - - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &complete_timestamp); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)"); - status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); + hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); - record->complete = entry->tracker->timestamp2ns(complete_timestamp); - record->begin = entry->tracker->timestamp2ns(dispatch_time.start); - record->end = 
entry->tracker->timestamp2ns(dispatch_time.end); + record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); + record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end); + record->complete = hsa_rsrc_->TimestampNs(); + // Original intercepted signal completion hsa_signal_t orig = entry->orig; if (orig.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); @@ -159,26 +172,41 @@ class Tracker { const hsa_signal_value_t value = hsa_signal_load_relaxed(orig); hsa_signal_store_screlease(orig, value - 1); } - entry->tracker->Del(entry); - - return false; } - inline timestamp_t timestamp2ns(const timestamp_t& timestamp) const { - const freq_t timestamp_ns = (freq_t)timestamp * timestamp_factor_; - return (timestamp_t)timestamp_ns; + // Handler for packet completion + static bool Handler(hsa_signal_value_t, void* arg) { + // Acquire entry + entry_t* entry = reinterpret_cast(arg); + volatile std::atomic* ptr = &entry->handler; + while (ptr->load(std::memory_order_acquire) == NULL) sched_yield(); + + // Complete entry + entry->tracker->Complete(entry); + + // Call entry handler + void* handler = static_cast(entry->handler); + if (entry->context_active) { + reinterpret_cast(handler)(0, entry->arg); + } else { + rocprofiler_group_t group{}; + reinterpret_cast(handler)(group, entry->arg); + } + + // Delete tracker entry + entry->tracker->Delete(entry); + + return false; } - // Timestamp frequency factor - freq_t timestamp_factor_; - // Timeout for wait on destruction - timestamp_t timeout_; // Tracked signals list sig_list_t sig_list_; // Inter-thread synchronization - static mutex_t mutex_; + mutex_t mutex_; // Outstanding dispatches - uint64_t outstanding; + std::atomic outstanding_; + // HSA resources factory + util::HsaRsrcFactory* hsa_rsrc_; // Enable tracing static const bool trace_on_ = false; }; diff --git a/src/core/types.h b/src/core/types.h index fd8bae33..c58d6cf2 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -32,6 +32,7 @@ typedef 
hsa_ven_amd_aqlprofile_parameter_t parameter_t; typedef hsa_ven_amd_aqlprofile_profile_t profile_t; typedef hsa_ext_amd_aql_pm4_packet_t packet_t; typedef uint32_t packet_word_t; +typedef uint64_t timestamp_t; } // namespace rocprofiler #endif // SRC_CORE_TYPES_H_ diff --git a/src/util/exception.h b/src/util/exception.h index 8af5f980..730028c2 100644 --- a/src/util/exception.h +++ b/src/util/exception.h @@ -30,27 +30,27 @@ THE SOFTWARE. #include #define EXC_ABORT(error, stream) \ - { \ + do { \ std::ostringstream oss; \ oss << __FUNCTION__ << "(), " << stream; \ - std::cout << oss.str() << std::endl; \ + std::cout << "error(" << error << ") \"" << oss.str() << "\"" << std::endl; \ abort(); \ - } + } while (0) #define EXC_RAISING(error, stream) \ - { \ + do { \ std::ostringstream oss; \ oss << __FUNCTION__ << "(), " << stream; \ throw rocprofiler::util::exception(error, oss.str()); \ - } + } while (0) #define AQL_EXC_RAISING(error, stream) \ - { \ + do { \ const char* error_string = NULL; \ - const rocprofiler::pfn_t* api = util::HsaRsrcFactory::Instance().AqlProfileApi(); \ + const rocprofiler::pfn_t* api = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi(); \ api->hsa_ven_amd_aqlprofile_error_string(&error_string); \ EXC_RAISING(error, stream << ", " << error_string); \ - } + } while (0) namespace rocprofiler { namespace util { diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index ff749d15..3c50d27d 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -1,24 +1,26 @@ -/****************************************************************************** -Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*******************************************************************************/ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ #include "util/hsa_rsrc_factory.h" @@ -42,6 +44,9 @@ THE SOFTWARE. #include #include +#include "util/exception.h" +#include "util/logger.h" + namespace rocprofiler { namespace util { @@ -108,14 +113,21 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { // Constructor of the class HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { hsa_status_t status; + + cpu_pool_ = NULL; + kern_arg_pool_ = NULL; + // Initialize the Hsa Runtime if (initialize_hsa_) { status = hsa_init(); CHECK_STATUS("Error in hsa_init", status); } + // Discover the set of Gpu devices available on the platform status = hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); + if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); + if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); // Get AqlProfile API table aqlprofile_api_ = {0}; @@ -130,10 +142,19 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize loader_api_ = {0}; status = 
hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); CHECK_STATUS("loader API table query failed", status); + + // Instantiate HSA timer + timer_ = new HsaTimer; + CHECK_STATUS("HSA timer allocation failed", + (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + + // System timeout + timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { + delete timer_; for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { @@ -160,8 +181,10 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); +#ifdef AQLPROF_NEW_API api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); +#endif api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); @@ -191,9 +214,9 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_index = cpu_list_.size(); status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); - CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status); + if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); - CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status); + if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; cpu_list_.push_back(agent_info); @@ -355,7 +378,7 @@ uint8_t* 
HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size uint8_t* buffer = NULL; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; @@ -375,7 +398,7 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { - status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; @@ -399,22 +422,38 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s return ptr; } +// Wait signal +void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { + while (1) { + const hsa_signal_value_t signal_value = + hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); + if (signal_value == 0) { + break; + } else { + if (signal_value == 1) WARN_LOGGING("signal waiting..."); + else EXC_RAISING(HSA_STATUS_ERROR, "hsa_signal_wait_scacquire (" << signal_value << ")"); + } + } +} + +// Wait signal with signal value restore +void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + SignalWait(signal); + hsa_signal_store_relaxed(const_cast(signal), signal_value); +} + // Copy data from GPU to host memory bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { hsa_status_t 
status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; status = hsa_signal_create(1, 0, NULL, &s); - if (status == HSA_STATUS_SUCCESS) { - status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); - if (status == HSA_STATUS_SUCCESS) { - if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) { - status = HSA_STATUS_ERROR; - } - } - status = hsa_signal_destroy(s); - } + CHECK_STATUS("hsa_signal_create()", status); + status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + CHECK_STATUS("hsa_amd_memory_async_copy()", status); + SignalWait(s); + status = hsa_signal_destroy(s); + CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); } @@ -557,6 +596,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; +HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; } // namespace util } // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index b00ee8ed..c76046d2 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -1,24 +1,26 @@ -/****************************************************************************** -Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*******************************************************************************/ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ #ifndef SRC_UTIL_HSA_RSRC_FACTORY_H_ #define SRC_UTIL_HSA_RSRC_FACTORY_H_ @@ -43,21 +45,23 @@ THE SOFTWARE. #define HSA_QUEUE_ALIGN_BYTES 64 #define HSA_PACKET_ALIGN_BYTES 64 -#define CHECK_STATUS(msg, status) \ - if (status != HSA_STATUS_SUCCESS) { \ +#define CHECK_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_SUCCESS) { \ const char* emsg = 0; \ hsa_status_string(status, &emsg); \ printf("%s: %s\n", msg, emsg ? emsg : ""); \ - exit(1); \ - } + abort(); \ + } \ +} while (0) -#define CHECK_ITER_STATUS(msg, status) \ - if (status != HSA_STATUS_INFO_BREAK) { \ +#define CHECK_ITER_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_INFO_BREAK) { \ const char* emsg = 0; \ hsa_status_string(status, &emsg); \ printf("%s: %s\n", msg, emsg ? 
emsg : ""); \ - exit(1); \ - } + abort(); \ + } \ +} while (0) namespace rocprofiler { namespace util { @@ -116,9 +120,42 @@ struct AgentInfo { uint32_t shader_arrays_per_se; }; +// HSA timer class +// Provides current HSA timestampa and system-clock/ns conversion API +class HsaTimer { + public: + typedef uint64_t timestamp_t; + static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; + typedef long double freq_t; + + HsaTimer() { + timestamp_t sysclock_hz = 0; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); + sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; + } + + // Methids for system-clock/ns conversion + timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); } + + // Return timestamp in 'ns' + timestamp_t timestamp_ns() const { + timestamp_t sysclock; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); + return sysclock_to_ns(sysclock); + } + + private: + // Timestamp frequency factor + freq_t sysclock_factor_; +}; + class HsaRsrcFactory { public: typedef std::recursive_mutex mutex_t; + typedef HsaTimer::timestamp_t timestamp_t; static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); @@ -204,6 +241,12 @@ class HsaRsrcFactory { // @return uint8_t* Pointer to buffer, null if allocation fails. 
uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + // Wait signal + void SignalWait(const hsa_signal_t& signal) const; + + // Wait signal with signal value restore + void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; + // Copy data from GPU to host memory bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); @@ -235,6 +278,19 @@ class HsaRsrcFactory { // Return Loader API table const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + // Methods for system-clock/ns conversion and timestamp in 'ns' + timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); } + timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); } + timestamp_t TimestampNs() const { return timer_->timestamp_ns(); } + + timestamp_t GetSysTimeout() const { return timeout_; } + static timestamp_t GetTimeoutNs() { return timeout_ns_; } + static void SetTimeoutNs(const timestamp_t& time) { + std::lock_guard lck(mutex_); + timeout_ns_ = time; + if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time); + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -280,6 +336,18 @@ class HsaRsrcFactory { // Loader API table hsa_ven_amd_loader_1_00_pfn_t loader_api_; + + // System timeout, ns + static timestamp_t timeout_ns_; + // System timeout, sysclock + timestamp_t timeout_; + + // HSA timer + HsaTimer* timer_; + + // CPU/kern-arg memory pools + hsa_amd_memory_pool_t *cpu_pool_; + hsa_amd_memory_pool_t *kern_arg_pool_; }; } // namespace util diff --git a/src/util/logger.h b/src/util/logger.h index 97477899..d37f6567 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -66,6 +66,7 @@ class Logger { static void begm() { 
Instance().ResetStreaming(true); } static void endl() { Instance().ResetStreaming(false); } + static void errm() { Instance().SetError(); } static const std::string& LastMessage() { Logger& logger = Instance(); @@ -94,19 +95,27 @@ class Logger { static uint32_t GetPid() { return syscall(__NR_getpid); } static uint32_t GetTid() { return syscall(__NR_gettid); } - Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) { - const char* path = getenv("ROCPROFILER_LOG"); - if (path != NULL) { - file_ = fopen("/tmp/rocprofiler_log.txt", "a"); + Logger() : file_(NULL), session_file_(NULL), dirty_(false), streaming_(false), messaging_(false), error_(false) { + const char* var = getenv("ROCPROFILER_LOG"); + if (var != NULL) file_ = fopen("/tmp/rocprofiler_log.txt", "a"); + + var = getenv("ROCPROFILER_SESS"); + if (var != NULL) { + std::string dir = var; + if (dir.back() != '/') dir.push_back('/'); + std::string name = dir + "log.txt"; + session_file_ = fopen(name.c_str(), "a"); + if (session_file_ != NULL) session_dir_ = dir; + else std::cerr << "ROCProfiler: cannot create session log '" << name << "'" << std::endl << std::flush; } + ResetStreaming(false); } ~Logger() { - if (file_ != NULL) { - if (dirty_) Put("\n"); - fclose(file_); - } + if (dirty_) Put("\n"); + if (file_ != NULL) fclose(file_); + if (session_file_ != NULL) fclose(session_file_); } void ResetStreaming(const bool messaging) { @@ -129,8 +138,15 @@ class Logger { if (file_ != NULL) { dirty_ = true; flock(fileno(file_), LOCK_EX); + fprintf(file_, "%s", m.c_str()); fflush(file_); + + if (session_file_ != NULL) { + fprintf(session_file_, "%s", m.c_str()); + fflush(session_file_); + } + flock(fileno(file_), LOCK_UN); } } @@ -146,10 +162,23 @@ class Logger { Put(oss.str()); } + void SetError() { + std::lock_guard lck(mutex_); + if (error_ == false) { + error_ = true; + if (session_dir_.empty() == false) { + auto x = fopen(std::string(session_dir_ + "error").c_str(), "w"); (void)x; + } + } + } + 
FILE* file_; + FILE* session_file_; bool dirty_; bool streaming_; bool messaging_; + bool error_; + std::string session_dir_; static mutex_t mutex_; static Logger* instance_; @@ -160,32 +189,33 @@ class Logger { } // namespace rocprofiler #define ERR_LOGGING(stream) \ - { \ - rocprofiler::util::Logger::Instance() << "error: " << rocprofiler::util::Logger::begm \ + do { \ + rocprofiler::util::Logger::Instance() << rocprofiler::util::Logger::errm \ + << "error: " << rocprofiler::util::Logger::begm \ << stream << rocprofiler::util::Logger::endl; \ - } + } while(0) #define INFO_LOGGING(stream) \ - { \ + do { \ rocprofiler::util::Logger::Instance() << "info: " << rocprofiler::util::Logger::begm << stream \ << rocprofiler::util::Logger::endl; \ - } + } while(0) #define WARN_LOGGING(stream) \ - { \ - std::cerr << "ROCProfiler: " << stream << std::endl; \ + do { \ + std::cerr << "ROCProfiler: " << stream << std::endl; \ rocprofiler::util::Logger::Instance() << "warning: " << rocprofiler::util::Logger::begm << stream \ << rocprofiler::util::Logger::endl; \ - } + } while(0) #ifdef DEBUG #define DBG_LOGGING(stream) \ - { \ + do { \ rocprofiler::util::Logger::Instance() << rocprofiler::util::Logger::begm << "debug: \"" \ << stream << "\"" < < < < \ " in " << __FUNCTION__ << " at " << __FILE__ << " line " << __LINE__ \ << rocprofiler::util::Logger::endl; \ - } + } while(0) #endif #endif // SRC_UTIL_LOGGER_H_ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 278bc5c4..2f35639d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
-# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -20,7 +20,7 @@ # THE SOFTWARE. ################################################################################ -cmake_minimum_required ( VERSION 3.5.0 ) +cmake_minimum_required ( VERSION 2.8.12 ) set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) set ( EXE_NAME "ctrl" ) @@ -49,7 +49,7 @@ execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${TEST_NAME}/*.hsaco ${PROJECT_ ## Building test executable add_executable ( ${EXE_NAME} ${KERN_SRC} ${CTRL_SRC} ${UTIL_SRC} ) target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt atomic ) +target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/tool/*.xml ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "mkdir -p ${PROJECT_BINARY_DIR}/RESULTS" ) @@ -59,4 +59,4 @@ set ( TEST_LIB "tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} 
) target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt atomic ) +target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) diff --git a/test/run.sh b/test/run.sh index 037b47a2..3ac292e6 100755 --- a/test/run.sh +++ b/test/run.sh @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -36,6 +36,8 @@ export ROCP_TOOL_LIB=libtool.so unset ROCP_PROXY_QUEUE # ROC profiler metrics config file export ROCP_METRICS=metrics.xml +# ROC profiler kernels timing +export ROCP_TIMESTAMP_ON=1 # output directory for the tool library, for metrics results file 'results.txt' export ROCP_OUTPUT_DIR=./RESULTS @@ -54,6 +56,11 @@ export ROCP_DITER=100 export ROCP_INPUT=input.xml eval $tbin +#export ROCP_KITER=1 +#export ROCP_DITER=4 +#export ROCP_INPUT=input1.xml +#eval $tbin + #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. 
diff --git a/test/tool/input1.xml b/test/tool/input1.xml new file mode 100644 index 00000000..254c83dc --- /dev/null +++ b/test/tool/input1.xml @@ -0,0 +1,5 @@ +# List of metrics + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 0eb79940..65385078 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -37,11 +37,14 @@ THE SOFTWARE. #include #include +#include +#include #include #include #include #include #include +#include #include #include "inc/rocprofiler.h" @@ -68,7 +71,8 @@ struct callbacks_data_t { // Context stored entry type struct context_entry_t { - uint32_t valid; + bool valid; + bool active; uint32_t index; hsa_agent_t agent; rocprofiler_group_t group; @@ -93,8 +97,6 @@ callbacks_data_t* callbacks_data = NULL; // Stored contexts array typedef std::map context_array_t; context_array_t* context_array = NULL; -typedef std::list wait_list_t; -wait_list_t* wait_list = NULL; // Contexts collected count volatile uint32_t context_count = 0; volatile uint32_t context_collected = 0; @@ -169,9 +171,9 @@ std::string filtr_kernel_name(const std::string name) { } ++rit; } - while (((*rit == ' ') || (*rit == ' ')) && (rit != rend)) rit++; + while (rit != rend) if ((*rit == ' ') || (*rit == ' ')) rit++; else break; auto rbeg = rit; - while ((*rit != ' ') && (*rit != ':') && (rit != rend)) rit++; + while (rit != rend) if ((*rit != ' ') && (*rit != ':')) rit++; else break; const uint32_t pos = rend - rit; const uint32_t length = rit - rbeg; return name.substr(pos, length); @@ -382,11 +384,12 @@ void output_group(const context_entry_t* entry, const char* label) { } } -// Dump stored context profiling output data -bool dump_context(context_entry_t* entry) { +// Dump stored context entry +bool dump_context_entry(context_entry_t* entry) { hsa_status_t status = HSA_STATUS_ERROR; - if (entry->valid == 0) return true; + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); const 
rocprofiler_dispatch_record_t* record = entry->data.record; if (record) { @@ -436,65 +439,48 @@ bool dump_context(context_entry_t* entry) { rocprofiler_close(group.context); } - entry->valid = 0; return true; } -// Dump and clean a given context entry -static inline bool dump_context_entry(context_entry_t* entry) { - const bool ret = dump_context(entry); - if (ret) dealloc_context_entry(entry); - return ret; -} - -// Dump waiting entries -static inline void dump_wait_list() { - if (pthread_mutex_lock(&mutex) != 0) { - perror("pthread_mutex_lock"); - abort(); - } - - auto it = wait_list->begin(); - auto end = wait_list->end(); - while (it != end) { - auto cur = it++; - if (dump_context_entry(*cur)) { - wait_list->erase(cur); +// Wait for and dump all stored contexts for a given queue if not NULL +void dump_context_array(hsa_queue_t* queue) { + bool done = false; + while (done == false) { + done = true; + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); } - } - if (pthread_mutex_unlock(&mutex) != 0) { - perror("pthread_mutex_unlock"); - abort(); - } -} - -// Dump all stored contexts profiling output data -void dump_context_array() { - if (pthread_mutex_lock(&mutex) != 0) { - perror("pthread_mutex_lock"); - abort(); - } - - if (context_array) { - if (!wait_list->empty()) dump_wait_list(); + if (context_array) { + auto it = context_array->begin(); + auto end = context_array->end(); + while (it != end) { + auto cur = it++; + context_entry_t* entry = &(cur->second); + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); + if ((queue == NULL) || (entry->data.queue == queue)) { + if (entry->active == true) { + if (dump_context_entry(&(cur->second)) == false) done = false; + else entry->active = false; + } + } + } - auto it = context_array->begin(); - auto end = context_array->end(); - while (it != end) { - auto cur = it++; - dump_context(&(cur->second)); + if 
(pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + if (done == false) sched_yield(); } } - - if (pthread_mutex_unlock(&mutex) != 0) { - perror("pthread_mutex_unlock"); - abort(); - } } // Profiling completion handler -bool handler(rocprofiler_group_t group, void* arg) { +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler(rocprofiler_group_t group, void* arg) { context_entry_t* entry = reinterpret_cast(arg); if (pthread_mutex_lock(&mutex) != 0) { @@ -502,11 +488,15 @@ bool handler(rocprofiler_group_t group, void* arg) { abort(); } - if (!wait_list->empty()) dump_wait_list(); - - if (!dump_context_entry(entry)) { - wait_list->push_back(entry); + bool ret = true; + if (entry->active == true) { + ret = dump_context_entry(entry); + if (ret == false) { + fprintf(stderr, "tool error: context is not complete\n"); + abort(); + } } + if (ret) dealloc_context_entry(entry); if (trace_on) { fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); @@ -577,7 +567,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, context_entry_t* entry = alloc_context_entry(); // context properties rocprofiler_properties_t properties{}; - properties.handler = (result_prefix != NULL) ? handler : NULL; + properties.handler = (result_prefix != NULL) ? 
context_handler : NULL; properties.handler_arg = (void*)entry; rocprofiler_feature_t* features = tool_data->features; @@ -598,22 +588,20 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, feature_count = next_offset - set_offset; } - if (tool_data->feature_count > 0) { - // Open profiling context - status = rocprofiler_open(callback_data->agent, features, feature_count, - &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); - check_status(status); - - // Check that we have only one profiling group - uint32_t group_count = 0; - status = rocprofiler_group_count(context, &group_count); - check_status(status); - assert(group_count == 1); - // Get group[0] - const uint32_t group_index = 0; - status = rocprofiler_get_group(context, group_index, group); - check_status(status); - } + // Open profiling context + status = rocprofiler_open(callback_data->agent, features, feature_count, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Check that we have only one profiling group + uint32_t group_count = 0; + status = rocprofiler_group_count(context, &group_count); + check_status(status); + assert(group_count == 1); + // Get group[0] + const uint32_t group_index = 0; + status = rocprofiler_get_group(context, group_index, group); + check_status(status); // Fill profiling context entry entry->agent = callback_data->agent; @@ -623,7 +611,8 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, entry->data = *callback_data; entry->data.kernel_name = strdup(callback_data->kernel_name); entry->file_handle = tool_data->file_handle; - entry->valid = 1; + entry->active = true; + reinterpret_cast*>(&entry->valid)->store(true); if (trace_on) { fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); @@ -635,7 +624,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, hsa_status_t 
destroy_callback(hsa_queue_t* queue, void*) { if (result_file_opened == false) printf("\nROCProfiler results:\n"); - dump_context_array(); + dump_context_array(queue); return HSA_STATUS_SUCCESS; } @@ -644,8 +633,16 @@ static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg if (((symb == 'b') && (info.metric.expr == NULL)) || ((symb == 'd') && (info.metric.expr != NULL))) { - printf("\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); - if (info.metric.expr != NULL) printf(" %s = %s\n", info.metric.name, info.metric.expr); + if (info.metric.expr != NULL) { + fprintf(stdout, "\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); + fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); + } else { + fprintf(stdout, "\n gpu-agent%d : %s", info.agent_index, info.metric.name); + if (info.metric.instances > 1) fprintf(stdout, "[0-%u]", info.metric.instances - 1); + fprintf(stdout, " : %s\n", info.metric.description); + fprintf(stdout, " block %s has %u counters\n", info.metric.block_name, info.metric.block_counters); + } + fflush(stdout); } return HSA_STATUS_SUCCESS; } @@ -800,7 +797,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } else { if (*info_symb == 'b') printf("Basic HW counters:\n"); else printf("Derived metrics:\n"); - rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb); + hsa_status_t status = rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb); + check_status(status); } exit(1); } @@ -917,8 +915,10 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; parameters_dict["TOKEN_MASK2"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; +#ifdef AQLPROF_NEW_API parameters_dict["SE_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK; +#endif printf(" %s (", name.c_str()); 
features[index] = {}; @@ -956,7 +956,6 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Context array aloocation context_array = new context_array_t; - wait_list = new wait_list_t; // Adding dispatch observer rocprofiler_queue_callbacks_t callbacks_ptrs{0}; @@ -1007,21 +1006,13 @@ extern "C" PUBLIC_API void OnUnloadTool() { rocprofiler_remove_queue_callbacks(); // Dump stored profiling output data - printf("\nROCPRofiler: %u contexts collected", context_collected); - if (result_file_opened) printf(", output directory %s", result_prefix); - printf("\n"); fflush(stdout); - dump_context_array(); - if (wait_list) { - if (!wait_list->empty()) { - printf("\nWaiting for pending kernels ..."); fflush(stdout); - while (wait_list->size() != 0) { - usleep(1000); - dump_wait_list(); - } - printf(".done\n"); fflush(stdout); - } + printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout); + dump_context_array(NULL); + if (result_file_opened) { + fclose(result_file_handle); + printf(", output directory %s", result_prefix); } - if (result_file_opened) fclose(result_file_handle); + printf("\n"); fflush(stdout); // Cleanup if (callbacks_data != NULL) { @@ -1039,8 +1030,6 @@ extern "C" PUBLIC_API void OnUnloadTool() { range_vec = NULL; delete context_array; context_array = NULL; - delete wait_list; - wait_list = NULL; } extern "C" DESTRUCTOR_API void destructor() { diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 5116a3a8..5404608b 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -1,24 +1,26 @@ -/****************************************************************************** -Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*******************************************************************************/ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ #include "util/hsa_rsrc_factory.h" @@ -105,14 +107,21 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { // Constructor of the class HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { hsa_status_t status; + + cpu_pool_ = NULL; + kern_arg_pool_ = NULL; + // Initialize the Hsa Runtime if (initialize_hsa_) { status = hsa_init(); CHECK_STATUS("Error in hsa_init", status); } + // Discover the set of Gpu devices available on the platform status = hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); + if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); + if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); // Get AqlProfile API table aqlprofile_api_ = {0}; @@ -127,10 +136,19 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize loader_api_ = {0}; status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); CHECK_STATUS("loader API table query failed", status); + + // Instantiate HSA timer + timer_ = 
new HsaTimer; + CHECK_STATUS("HSA timer allocation failed", + (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + + // System timeout + timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { + delete timer_; for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { @@ -157,8 +175,10 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); +#ifdef AQLPROF_NEW_API api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); +#endif api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); @@ -188,9 +208,9 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_index = cpu_list_.size(); status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); - CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status); + if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); - CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status); + if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; cpu_list_.push_back(agent_info); @@ -352,7 +372,7 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size uint8_t* buffer = NULL; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - 
status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; @@ -372,7 +392,7 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { - status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; @@ -396,22 +416,37 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s return ptr; } +// Wait signal +void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { + while (1) { + const hsa_signal_value_t signal_value = + hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); + if (signal_value == 0) { + break; + } else { + CHECK_STATUS("hsa_signal_wait_scacquire()", HSA_STATUS_ERROR); + } + } +} + +// Wait signal with signal value restore +void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + SignalWait(signal); + hsa_signal_store_relaxed(const_cast(signal), signal_value); +} + // Copy data from GPU to host memory bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; status = hsa_signal_create(1, 0, NULL, &s); - if (status == HSA_STATUS_SUCCESS) { - status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); - if (status == 
HSA_STATUS_SUCCESS) { - if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) { - status = HSA_STATUS_ERROR; - } - } - status = hsa_signal_destroy(s); - } + CHECK_STATUS("hsa_signal_create()", status); + status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + CHECK_STATUS("hsa_amd_memory_async_copy()", status); + SignalWait(s); + status = hsa_signal_destroy(s); + CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); } @@ -554,3 +589,4 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; +HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index e7dcc559..c9466f89 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -1,24 +1,26 @@ -/****************************************************************************** -Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*******************************************************************************/ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ #ifndef TEST_UTIL_HSA_RSRC_FACTORY_H_ #define TEST_UTIL_HSA_RSRC_FACTORY_H_ @@ -43,21 +45,23 @@ THE SOFTWARE. 
#define HSA_QUEUE_ALIGN_BYTES 64 #define HSA_PACKET_ALIGN_BYTES 64 -#define CHECK_STATUS(msg, status) \ - if (status != HSA_STATUS_SUCCESS) { \ +#define CHECK_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_SUCCESS) { \ const char* emsg = 0; \ hsa_status_string(status, &emsg); \ printf("%s: %s\n", msg, emsg ? emsg : ""); \ - exit(1); \ - } + abort(); \ + } \ +} while (0) -#define CHECK_ITER_STATUS(msg, status) \ - if (status != HSA_STATUS_INFO_BREAK) { \ +#define CHECK_ITER_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_INFO_BREAK) { \ const char* emsg = 0; \ hsa_status_string(status, &emsg); \ printf("%s: %s\n", msg, emsg ? emsg : ""); \ - exit(1); \ - } + abort(); \ + } \ +} while (0) static const size_t MEM_PAGE_BYTES = 0x1000; static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; @@ -114,9 +118,42 @@ struct AgentInfo { uint32_t shader_arrays_per_se; }; +// HSA timer class +// Provides current HSA timestampa and system-clock/ns conversion API +class HsaTimer { + public: + typedef uint64_t timestamp_t; + static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; + typedef long double freq_t; + + HsaTimer() { + timestamp_t sysclock_hz = 0; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); + sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; + } + + // Methids for system-clock/ns conversion + timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); } + + // Return timestamp in 'ns' + timestamp_t timestamp_ns() const { + timestamp_t sysclock; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); + return sysclock_to_ns(sysclock); + } + + 
private: + // Timestamp frequency factor + freq_t sysclock_factor_; +}; + class HsaRsrcFactory { public: typedef std::recursive_mutex mutex_t; + typedef HsaTimer::timestamp_t timestamp_t; static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); @@ -202,6 +239,12 @@ class HsaRsrcFactory { // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + // Wait signal + void SignalWait(const hsa_signal_t& signal) const; + + // Wait signal with signal value restore + void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; + // Copy data from GPU to host memory bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); @@ -233,6 +276,19 @@ class HsaRsrcFactory { // Return Loader API table const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + // Methods for system-clock/ns conversion and timestamp in 'ns' + timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); } + timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); } + timestamp_t TimestampNs() const { return timer_->timestamp_ns(); } + + timestamp_t GetSysTimeout() const { return timeout_; } + static timestamp_t GetTimeoutNs() { return timeout_ns_; } + static void SetTimeoutNs(const timestamp_t& time) { + std::lock_guard lck(mutex_); + timeout_ns_ = time; + if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time); + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -278,6 +334,18 @@ class HsaRsrcFactory { // Loader API table hsa_ven_amd_loader_1_00_pfn_t loader_api_; + + // System timeout, ns + static timestamp_t timeout_ns_; + // System timeout, sysclock 
+ timestamp_t timeout_; + + // HSA timer + HsaTimer* timer_; + + // CPU/kern-arg memory pools + hsa_amd_memory_pool_t *cpu_pool_; + hsa_amd_memory_pool_t *kern_arg_pool_; }; From f344120fcce98120a9b488c8dc60273b74e33980 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 20 Aug 2018 00:48:43 -0500 Subject: [PATCH 005/153] metrics descriptions --- test/tool/gfx_metrics.xml | 100 +++++++++++++++++++------------------- test/tool/metrics.xml | 5 +- 2 files changed, 52 insertions(+), 53 deletions(-) diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index 899ca85e..9e4f24fc 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -1,31 +1,31 @@ - - + + - - - - - - - - - - + + + + + + + + + + - - - - - + + + + + - - - + + + - - - + + + @@ -33,37 +33,37 @@ - - + + - - - - - - - - - - + + + + + + + + + + - - - - - + + + + + - - - + + + - - - - - - - + + + + + + + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index a346eee9..6ee5c1d6 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -47,15 +47,14 @@ - # GPUBusy, percentage - # The percentage of time GPU was busy. + # GPUBusy The percentage of time GPU was busy. - # Wavefronts Total wavefronts., + # Wavefronts Total wavefronts. 
Date: Mon, 20 Aug 2018 02:53:16 -0500 Subject: [PATCH 006/153] install prefix --- cmake_modules/env.cmake | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index a71acb66..47d404a2 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -118,15 +118,20 @@ get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY ) find_library ( HSA_KMT_LIB "libhsakmt.so" ) get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY ) -set ( API_PATH ${HSA_RUNTIME_INC_PATH} ) +## Install directory +if ( DEFINED ENV{CMAKE_INSTALL_DIR} ) + set ( CMAKE_INSTALL_PREFIX $ENV{CMAKE_INSTALL_DIR} ) +else () + set ( CMAKE_INSTALL_PREFIX "/opt/rocm" ) +endif () ## Basic Tool Chain Information -message ( "----------------NBIT: ${NBIT}" ) -message ( "-----------BuildType: ${CMAKE_BUILD_TYPE}" ) +message ( "----------------NBit: ${NBIT}" ) +message ( "----------Build-Type: ${CMAKE_BUILD_TYPE}" ) message ( "------------Compiler: ${CMAKE_CXX_COMPILER}" ) message ( "----Compiler-Version: ${CMAKE_CXX_COMPILER_VERSION}" ) message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) -message ( "------------API-path: ${API_PATH}" ) -message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) +message ( "-----------CXX-Flags: ${CMAKE_CXX_FLAGS}" ) message ( "---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) +message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) From d1e594d468621ca702d460d17cdcc1a485e76487 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:17:12 -0500 Subject: [PATCH 007/153] Update README.md --- README.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ac63141..18be5e38 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,9 @@ ROC profiler library. 
Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces The library source tree: - - doc - Documentation + - bin + - rpl_run.sh - profiling utility + - doc - Documentation - inc/rocprofiler.h - Library public API - src - Library sources - core - Library API sources @@ -30,6 +32,9 @@ The library source tree: cd build cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. make + + For ROCM under 1.9 need: + export CMAKE_CURR_API=1 ``` ## To run the test: @@ -59,3 +64,73 @@ The library source tree: ``` export ROCPROFILER_TRACE=1 ``` + +## Profiling utility usage: + + rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc/sqtt line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize + # Perf counters group 2 + pmc : WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 + + Input file .xml format, for single profiling run: + + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + + + # Filter by dispatches range, GPU index and kernel names + + + -o - output CSV file [.csv] + -d - directory where profiler store profiling data including 
thread treaces [/tmp] + The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. + + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000] + Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively. + --sqtt-local - to allocate SQTT buffer in local GPU memory [on] + +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: + First the configuration file is looking in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'. 
+ An example of 'rpl_rc.xml': + + From 0921b34f0a53a20d7f0e77a8f0c5d404618c9f49 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:18:23 -0500 Subject: [PATCH 008/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 18be5e38..a0f9f440 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ The library source tree: ``` ## Profiling utility usage: - +``` rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] Options: @@ -133,4 +133,4 @@ Configuration file: sqtt-size=0x20M sqtt-local=on > - +``` From 1cf808b55c3d339392ea7fc9b8b70b9cdceb165a Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:27:33 -0500 Subject: [PATCH 009/153] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a0f9f440..ae1d889e 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ HW specific low-level performance analysis interface for profiling of GPU comput The library source tree: - bin - - rpl_run.sh - profiling utility + - rpl_run.sh - Profiling tool run script - doc - Documentation - inc/rocprofiler.h - Library public API - src - Library sources @@ -14,6 +14,9 @@ The library source tree: - util - Library utils sources - xml - XML parser - test - Library test suite + - tool - Profiling tool + - tool.cpp - tool sources + - metrics.xml - metrics config file - ctrl - Test controll - util - Test utils - simple_convolution - Simple convolution test kernel @@ -31,7 +34,6 @@ The library source tree: mkdir build cd build cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. 
- make For ROCM under 1.9 need: export CMAKE_CURR_API=1 From aae99a8ba70297f2f8c0724dd882b542664fc060 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:28:47 -0500 Subject: [PATCH 010/153] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ae1d889e..40b5e4f5 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ The library source tree: mkdir build cd build cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. + make For ROCM under 1.9 need: export CMAKE_CURR_API=1 From d763aa8aaffe84ab23653b97e1a7fb17af8654d4 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 22 Aug 2018 11:34:31 -0500 Subject: [PATCH 011/153] install dir --- cmake_modules/env.cmake | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 47d404a2..37223b6a 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -118,13 +118,6 @@ get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY ) find_library ( HSA_KMT_LIB "libhsakmt.so" ) get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY ) -## Install directory -if ( DEFINED ENV{CMAKE_INSTALL_DIR} ) - set ( CMAKE_INSTALL_PREFIX $ENV{CMAKE_INSTALL_DIR} ) -else () - set ( CMAKE_INSTALL_PREFIX "/opt/rocm" ) -endif () - ## Basic Tool Chain Information message ( "----------------NBit: ${NBIT}" ) message ( "----------Build-Type: ${CMAKE_BUILD_TYPE}" ) From 4ca55841853dea4fd3abca54a106fba204bbc0c3 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:35:39 -0500 Subject: [PATCH 012/153] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 40b5e4f5..20be662f 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,13 @@ The library source tree: ## To build with the current installed ROCM: ``` + - To build 
and install to /opt/rocm/rocprofiler cd .../rocprofiler mkdir build cd build - cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. + cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa -DCMAKE_INSTALL_PREFIX=/opt/rocm .. make + make install For ROCM under 1.9 need: export CMAKE_CURR_API=1 From bdb98b64187969865fbf045bac5ed7a0c41b9af7 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 22 Aug 2018 12:02:47 -0500 Subject: [PATCH 013/153] rocm include dir search fix --- cmake_modules/env.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 37223b6a..68209386 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -107,9 +107,9 @@ elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" ) endif () ## Find hsa-runtime headers/lib -find_file ( HSA_RUNTIME_INC "hsa.h" ) -if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) - find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) +find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) +if ( "${HSA_RUNTIME_INC}" STREQUAL "" ) + find_file ( HSA_RUNTIME_INC "hsa.h" ) endif() find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" ) get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) From 3039a5162921530a080d7048bc50cfcfc3a09710 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sun, 26 Aug 2018 11:25:33 -0500 Subject: [PATCH 014/153] enable timestamps only --- CMakeLists.txt | 18 +++++++++++------- cmake_modules/env.cmake | 7 +++---- src/core/context.h | 7 ++++++- src/core/intercept_queue.h | 1 - test/tool/tool.cpp | 38 ++++++++++++++++++++++++++++---------- 5 files changed, 48 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 92bc348f..c8d473d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,23 +74,28 @@ endif () add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) ## Install information -install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${ROCPROFILER_NAME}/lib ) -install ( 
FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${ROCPROFILER_NAME}/include ) +#if ( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) +#message ( "CMAKE default prefix: ${CMAKE_INSTALL_PREFIX}" ) +#endif () +set ( CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/${ROCPROFILER_NAME}" ) +message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) +install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION lib ) +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION include ) # rpl_run.sh tblextr.py txt2xml.sh install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py - DESTINATION ${ROCPROFILER_NAME}/bin + DESTINATION bin PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) # gfx_metrics.xml metrics.xml install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/gfx_metrics.xml - DESTINATION ${ROCPROFILER_NAME}/lib ) + DESTINATION lib ) # libtool.so -install ( FILES ${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION ${ROCPROFILER_NAME}/tool ) -install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION ${ROCPROFILER_NAME}/tool +install ( FILES ${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION tool ) +install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION tool PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) ## Packaging directives @@ -105,7 +110,6 @@ set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) ## Debian package specific variables set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" ) -set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/HSA-RocProfiler" ) set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) ## RPM package specific variables diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 
68209386..6bf6ed45 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -107,9 +107,9 @@ elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" ) endif () ## Find hsa-runtime headers/lib -find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) -if ( "${HSA_RUNTIME_INC}" STREQUAL "" ) - find_file ( HSA_RUNTIME_INC "hsa.h" ) +find_file ( HSA_RUNTIME_INC "hsa.h" ) +if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) + find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) endif() find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" ) get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) @@ -127,4 +127,3 @@ message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) message ( "-----------CXX-Flags: ${CMAKE_CXX_FLAGS}" ) message ( "---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) -message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) diff --git a/src/core/context.h b/src/core/context.h index f7ad792d..6eb391a8 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -160,13 +160,18 @@ class Context { queue_(queue), hsa_rsrc_(&util::HsaRsrcFactory::Instance()), api_(hsa_rsrc_->AqlProfileApi()), + metrics_(NULL), handler_(handler), handler_arg_(handler_arg) { - if (info_count == 0) return; + if (info_count == 0) { + set_.push_back(Group(agent_info_, this, 0)); + return; + } metrics_ = MetricsDict::Create(agent_info); if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + if (Initialize(info, info_count) == false) { fprintf(stdout, "\nInput metrics out of HW limit. 
Proposed metrics group set:\n"); fflush(stdout); MetricsGroupSet(agent_info, info, info_count).Print(stdout); diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 627a718a..c99e51dc 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -181,7 +181,6 @@ class InterceptQueue { void* context_handler_arg = NULL; rocprofiler_handler_t context_handler_fun = context->GetHandler(&context_handler_arg); tracker_->Enable(tracker_entry, context_handler_fun, context_handler_arg); - rocprofiler_close(context); } } } diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 65385078..373f1f7b 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -144,6 +144,13 @@ void check_status(hsa_status_t status) { } } +// Print profiling results output break if terminal output is enabled +void results_output_break() { + const bool is_terminal_output = (result_file_opened == false); + if (is_terminal_output) printf("\nROCprofiler results:\n"); +} + +// Filtering kernel name std::string filtr_kernel_name(const std::string name) { auto rit = name.rbegin(); auto rend = name.rend(); @@ -179,6 +186,7 @@ std::string filtr_kernel_name(const std::string name) { return name.substr(pos, length); } +// Inflight submits monitoring thread void* monitor_thr_fun(void*) { while (context_array != NULL) { sleep(CTX_OUTSTANDING_MON); @@ -198,6 +206,7 @@ void* monitor_thr_fun(void*) { return NULL; } +// Increment profiling context counter value uint32_t next_context_count() { if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); @@ -423,12 +432,14 @@ bool dump_context_entry(context_entry_t* entry) { rocprofiler_group_t& group = entry->group; if (group.context != NULL) { - status = rocprofiler_group_get_data(&group); - check_status(status); - if (verbose == 1) output_group(entry, "group0-data"); + if (entry->feature_count > 0) { + status = rocprofiler_group_get_data(&group); + check_status(status); + if (verbose == 1) output_group(entry, 
"group0-data"); - status = rocprofiler_get_metrics(group.context); - check_status(status); + status = rocprofiler_get_metrics(group.context); + check_status(status); + } std::ostringstream oss; oss << index << "__" << filtr_kernel_name(entry->data.kernel_name); output_results(entry, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str()); @@ -623,7 +634,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, } hsa_status_t destroy_callback(hsa_queue_t* queue, void*) { - if (result_file_opened == false) printf("\nROCProfiler results:\n"); + results_output_break(); dump_context_array(queue); return HSA_STATUS_SUCCESS; } @@ -1006,13 +1017,20 @@ extern "C" PUBLIC_API void OnUnloadTool() { rocprofiler_remove_queue_callbacks(); // Dump stored profiling output data - printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout); - dump_context_array(NULL); + fflush(stdout); if (result_file_opened) { + printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout); + dump_context_array(NULL); fclose(result_file_handle); - printf(", output directory %s", result_prefix); + printf(", output directory %s\n", result_prefix); + } else { + if (context_collected != context_count) { + results_output_break(); + dump_context_array(NULL); + } + printf("\nROCPRofiler: %u contexts collected\n", context_collected); } - printf("\n"); fflush(stdout); + fflush(stdout); // Cleanup if (callbacks_data != NULL) { From ce893db802dd840b98591950eaddabeb625eaa31 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 17 Sep 2018 12:57:30 -0500 Subject: [PATCH 015/153] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 20be662f..8af4d76e 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ The library source tree: cd .../rocprofiler mkdir build cd build - cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa 
-DCMAKE_INSTALL_PREFIX=/opt/rocm .. + export CMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. make make install From 894eba54bec3ed74338d340ce5526b16393525a3 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 20 Sep 2018 16:16:29 -0500 Subject: [PATCH 016/153] Update README.md --- README.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/README.md b/README.md index 8af4d76e..41a5934e 100644 --- a/README.md +++ b/README.md @@ -43,18 +43,6 @@ The library source tree: export CMAKE_CURR_API=1 ``` -## To run the test: -``` - cd .../rocprofiler/build - export LD_LIBRARY_PATH=.: # paths to ROC profiler and oher libraries - export HSA_TOOLS_LIB=librocprofiler64.so # ROC profiler library loaded by HSA runtime - export ROCP_TOOL_LIB=test/libtool.so # tool library loaded by ROC profiler - export ROCP_METRICS=metrics.xml # ROC profiler metrics config file - export ROCP_INPUT=input.xml # input file for the tool library - export ROCP_OUTPUT_DIR=./ # output directory for the tool library, for metrics results file 'results.txt' - -``` - ## Internal 'simple_convolution' test run script: ``` cd .../rocprofiler/build From 8591164e8caf654ae0acead6224ba44ad3b608c9 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 4 Oct 2018 15:23:39 -0500 Subject: [PATCH 017/153] fix reading of the input counters line without end-of-line symboll --- bin/rpl_run.sh | 3 ++- bin/tblextr.py | 5 +++-- bin/txt2xml.sh | 5 +++-- test/run.sh | 3 ++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 64185761..cfa38832 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -1,3 +1,5 @@ +#!/bin/sh + ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -20,7 +22,6 @@ # THE SOFTWARE. 
################################################################################ -#!/bin/sh time_stamp=`date +%y%m%d_%H%M%S` BIN_DIR=`dirname $0` BIN_DIR=`cd $BIN_DIR; pwd` diff --git a/bin/tblextr.py b/bin/tblextr.py index 630417ce..6a0f8eb2 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -1,3 +1,5 @@ +#!/usr/bin/python + ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -20,7 +22,6 @@ # THE SOFTWARE. ################################################################################ -#!/usr/bin/python import os, sys, re # Parsing results in the format: @@ -114,7 +115,7 @@ def print_tbl(outfile): outfile = sys.argv[1] infiles = sys.argv[2:] -for f in infiles : +for f in infiles: parse_res(f) ret = print_tbl(outfile) sys.exit(ret) diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh index 9881160d..66da77db 100755 --- a/bin/txt2xml.sh +++ b/bin/txt2xml.sh @@ -1,3 +1,5 @@ +#!/bin/bash + ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -20,7 +22,6 @@ # THE SOFTWARE. ################################################################################ -#!/bin/bash timestamp=`date +%y%m%d_%H%M%S` if [ $# = 0 ] ; then @@ -41,7 +42,7 @@ gpu_index="" parse() { scan="$1" index=0 - while read -r line ; do + while read -r line || [[ -n "$line" ]] ; do line=`echo $line | sed "s/\s*#.*$//"` if [ -z "$line" ] ; then continue diff --git a/test/run.sh b/test/run.sh index 3ac292e6..a189d18a 100755 --- a/test/run.sh +++ b/test/run.sh @@ -1,3 +1,5 @@ +#!/bin/sh + ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -19,7 +21,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
################################################################################ -#!/bin/sh test_bin_dflt=./test/ctrl From a37902057d86b4a010f0811a1c358fdeaa20c767 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 12 Oct 2018 15:30:14 -0500 Subject: [PATCH 018/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 41a5934e..07ee7b63 100644 --- a/README.md +++ b/README.md @@ -73,9 +73,9 @@ Options: Input file .txt format, automatically rerun application for every pmc/sqtt line: # Perf counters group 1 - pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts FetchSize # Perf counters group 2 - pmc : WriteSize L2CacheHit + pmc : VALUUtilization,WriteSize L2CacheHit # Filter by dispatches range, GPU index and kernel names # supported range formats: "3:9", "3:", "3" range: 1 : 4 From 0fe0ffdcd1e0f70e9e71d38a5be89e1e1d825042 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 12 Oct 2018 15:34:38 -0500 Subject: [PATCH 019/153] Update README.md --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 07ee7b63..6f02bca0 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,6 @@ The library source tree: cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. 
make make install - - For ROCM under 1.9 need: - export CMAKE_CURR_API=1 ``` ## Internal 'simple_convolution' test run script: From 5ac3c576bfa39794700629e7d8f1165cbdf6f7a5 Mon Sep 17 00:00:00 2001 From: Rene van Oostrum Date: Mon, 15 Oct 2018 10:46:00 -0500 Subject: [PATCH 020/153] Change sorting so that it not only works with Python 2.7, but with 3.x too --- script/tblextr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/script/tblextr.py b/script/tblextr.py index 9a314db4..f6a37dc0 100755 --- a/script/tblextr.py +++ b/script/tblextr.py @@ -84,8 +84,7 @@ def print_tbl(outfile): out = open(outfile, 'w') - keys = var_table.keys() - keys.sort(key=int) + keys = sorted(var_table.keys(), key=int) entry = var_table[keys[0]] list1 = [] From df0ce581560f39faee91f8082effd62babfcc172 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 16 Nov 2018 19:26:53 -0600 Subject: [PATCH 021/153] tracker handlers ordering --- bin/rpl_run.sh | 52 ++--- inc/rocprofiler.h | 19 +- src/core/hsa_queue.h | 9 - src/core/intercept_queue.cpp | 4 +- src/core/intercept_queue.h | 47 ++++- src/core/metrics.h | 2 +- src/core/rocprofiler.cpp | 114 ++++++++++- src/core/tracker.h | 53 ++++-- src/core/types.h | 2 +- src/util/hsa_rsrc_factory.cpp | 9 +- src/util/hsa_rsrc_factory.h | 15 +- test/CMakeLists.txt | 34 +++- test/app/intercept_test.cpp | 231 +++++++++++++++++++++++ test/app/standalone_test.cpp | 163 ++++++++++++++++ test/app/test.cpp | 48 ++++- test/ctrl/run_kernel.h | 13 +- test/ctrl/test_hsa.cpp | 63 ++++--- test/ctrl/test_hsa.h | 20 +- test/dummy_kernel/dummy_kernel.cl | 28 +++ test/dummy_kernel/dummy_kernel.h | 71 +++++++ test/dummy_kernel/gfx8_DummyKernel.hsaco | Bin 0 -> 10952 bytes test/dummy_kernel/gfx9_DummyKernel.hsaco | Bin 0 -> 10952 bytes test/run.sh | 44 +++-- test/tool/metrics.xml | 70 +++---- test/tool/tool.cpp | 7 +- test/util/hsa_rsrc_factory.cpp | 9 +- test/util/hsa_rsrc_factory.h | 15 +- 27 files changed, 939 insertions(+), 203 deletions(-) create mode 
100644 test/app/intercept_test.cpp create mode 100644 test/app/standalone_test.cpp create mode 100644 test/dummy_kernel/dummy_kernel.cl create mode 100644 test/dummy_kernel/dummy_kernel.h create mode 100755 test/dummy_kernel/gfx8_DummyKernel.hsaco create mode 100755 test/dummy_kernel/gfx9_DummyKernel.hsaco diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index cfa38832..043c0007 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -82,7 +82,7 @@ usage() { echo "Metrics definition: $PKG_DIR/lib/metrics.xml" echo "" echo "Usage:" - echo " rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] " + echo " $bin_name [-h] [--list-basic] [--list-derived] [-i ] [-o ] " echo "" echo "Options:" echo " -h - this help" @@ -91,7 +91,7 @@ usage() { echo " --list-derived - to print the list of derived metrics with formulas" echo "" echo " -i <.txt|.xml file> - input file" - echo " Input file .txt format, automatically rerun application for every pmc/sqtt line:" + echo " Input file .txt format, automatically rerun application for every pmc line:" echo "" echo " # Perf counters group 1" echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize" @@ -131,22 +131,17 @@ usage() { echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" echo " --heartbeat - to print progress heartbeats [0 - disabled]" - echo " --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000]" - echo " Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively." - echo " --sqtt-local - to allocate SQTT buffer in local GPU memory [on]" echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:${HOME}:" echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." - echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'." + echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'." echo " An example of 'rpl_rc.xml':" echo " " echo "" exit 1 @@ -246,20 +241,6 @@ while [ 1 ] ; do export ROCP_OUTSTANDING_MAX="$2" elif [ "$1" = "--heartbeat" ] ; then export ROCP_OUTSTANDING_MON="$2" - elif [ "$1" = "--sqtt-size" ] ; then - size_m=`echo "$2" | sed -n "s/^\(.*\)M$/\1/p"` - size_k=`echo "$2" | sed -n "s/^\(.*\)K$/\1/p"` - if [ -n "$size_m" ] ; then size_b=$((size_m*1024*1024)) - elif [ -n "$size_k" ] ; then size_b=$((size_k*1024)) - else size_b=$2 - fi - export ROCP_SQTT_SIZE=$size_b - elif [ "$1" = "--sqtt-local" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_SQTT_LOCAL=1 - else - export ROCP_SQTT_LOCAL=0 - fi elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 @@ -276,15 +257,16 @@ if [ "$ARG_CK" = "-" ] ; then fi if [ -z "$INPUT_FILE" ] ; then - fatal "Need input file" -fi - -input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"` -input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"` -if [ -z "${input_base}" -o -z "${input_type}" ] ; then - fatal "Bad input file '$INPUT_FILE'" + input_base="results" + input_type="none" +else + input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"` + input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"` + if [ -z "${input_base}" -o -z "${input_type}" ] ; then + fatal "Bad input file '$INPUT_FILE'" + fi + input_base=`basename $input_base` fi -input_base=`basename $input_base` if [ "$OUTPUT_DIR" = "--" ] ; then fatal "Bad output dir '$OUTPUT_DIR'" @@ -309,7 +291,7 @@ input_list="" RES_DIR="" if [ "$input_type" = "xml" ] ; then input_list=$INPUT_FILE -elif [ "$input_type" = "txt" ] ; then 
+elif [ "$input_type" = "txt" -o "$input_type" = "none" ] ; then OUTPUT_DIR="-" RES_DIR=$DATA_PATH/$DATA_DIR if [ -e $RES_DIR ] ; then @@ -317,7 +299,11 @@ elif [ "$input_type" = "txt" ] ; then fi mkdir -p $RES_DIR echo "RPL: output dir '$RES_DIR'" - $BIN_DIR/txt2xml.sh $INPUT_FILE $RES_DIR + if [ "$input_type" = "txt" ] ; then + $BIN_DIR/txt2xml.sh $INPUT_FILE $RES_DIR + else + echo "" > $RES_DIR/input.xml + fi input_list=`/bin/ls $RES_DIR/input*.xml` export ROCPROFILER_SESS=$RES_DIR else @@ -341,6 +327,8 @@ if [ -n "$csv_output" ] ; then python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST if [ "$?" -eq 0 ] ; then echo "RPL: '$csv_output' is generated" + else + echo "Data extracting error: $OUTPUT_LIST'" fi fi diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 17106687..4448128f 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -45,7 +45,7 @@ THE SOFTWARE. #include #include -#define ROCPROFILER_VERSION_MAJOR 3 +#define ROCPROFILER_VERSION_MAJOR 5 #define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus @@ -216,21 +216,24 @@ typedef struct { uint32_t agent_index; // GPU index const hsa_queue_t* queue; // HSA queue uint64_t queue_index; // Index in the queue + uint32_t queue_id; // Queue id const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet const char* kernel_name; // Kernel name + uint64_t kernel_object; // Kernel object pointer + int64_t thread_id; // Thread id const rocprofiler_dispatch_record_t* record; // Dispatch record } rocprofiler_callback_data_t; // Profiling callback type typedef hsa_status_t (*rocprofiler_callback_t)( - const rocprofiler_callback_data_t* callback_data, // [in] callback data union, data depends on - // the callback API id + const rocprofiler_callback_data_t* callback_data, // [in] callback data void* user_data, // [in/out] user data passed to the callback - rocprofiler_group_t* group); // [out] profiling group + rocprofiler_group_t* group); // [out] returned profiling group // Queue callbacks typedef struct { 
rocprofiler_callback_t dispatch; // dispatch callback + hsa_status_t (*create)(hsa_queue_t* queue, void* data); // create callback hsa_status_t (*destroy)(hsa_queue_t* queue, void* data); // destroy callback } rocprofiler_queue_callbacks_t; @@ -309,6 +312,8 @@ typedef enum { ROCPROFILER_INFO_KIND_METRIC_COUNT = 1, // metric features count, int32 ROCPROFILER_INFO_KIND_TRACE = 2, // trace info ROCPROFILER_INFO_KIND_TRACE_COUNT = 3, // trace features count, int32 + ROCPROFILER_INFO_KIND_TRACE_PARAMETER = 4, // trace parameter info + ROCPROFILER_INFO_KIND_TRACE_PARAMETER_COUNT = 5 // trace parameter count, int32 } rocprofiler_info_kind_t; // Profiling info query @@ -337,6 +342,12 @@ typedef struct { const char* description; // trace description uint32_t parameter_count; // supported by the trace number parameters } trace; + struct { + uint32_t code; // parameter code + const char* trace_name; // trace name + const char* parameter_name; // parameter name + const char* description; // trace parameter description + } trace_parameter; }; } rocprofiler_info_data_t; diff --git a/src/core/hsa_queue.h b/src/core/hsa_queue.h index 620f6224..12ef97bb 100644 --- a/src/core/hsa_queue.h +++ b/src/core/hsa_queue.h @@ -32,15 +32,6 @@ namespace rocprofiler { class HsaQueue : public Queue { public: - typedef void (HsaQueue::*submit_fptr_t)(const packet_t* packet); - enum { - LEGACY_SLOT_SIZE_W = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_word_t), - LEGACY_SLOT_SIZE_P = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_t) - }; - struct slot_pm4_t { - packet_word_t words[LEGACY_SLOT_SIZE_W]; - }; - HsaQueue(const util::AgentInfo* agent_info, hsa_queue_t* queue) : queue_(queue) {} void Submit(const packet_t* packet) { diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp index 7703c662..91028f73 100644 --- a/src/core/intercept_queue.cpp +++ b/src/core/intercept_queue.cpp @@ -30,12 +30,14 @@ void InterceptQueue::HsaIntercept(HsaApiTable* 
table) { InterceptQueue::mutex_t InterceptQueue::mutex_; rocprofiler_callback_t InterceptQueue::dispatch_callback_ = NULL; +InterceptQueue::queue_callback_t InterceptQueue::create_callback_ = NULL; InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL; void* InterceptQueue::callback_data_ = NULL; InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; const char* InterceptQueue::kernel_none_ = ""; Tracker* InterceptQueue::tracker_ = NULL; bool InterceptQueue::tracker_on_ = false; -bool InterceptQueue::in_constr_call_ = false; +bool InterceptQueue::in_create_call_ = false; +InterceptQueue::queue_id_t InterceptQueue::current_queue_id = 0; } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index c99e51dc..1f31b0d9 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -26,6 +26,7 @@ THE SOFTWARE. #include #include #include +#include #include #include @@ -49,6 +50,7 @@ class InterceptQueue { typedef std::map obj_map_t; typedef hsa_status_t (*queue_callback_t)(hsa_queue_t*, void* data); typedef void (*queue_event_callback_t)(hsa_status_t status, hsa_queue_t *queue, void *arg); + typedef uint32_t queue_id_t; static void HsaIntercept(HsaApiTable* table); @@ -61,8 +63,8 @@ class InterceptQueue { std::lock_guard lck(mutex_); hsa_status_t status = HSA_STATUS_ERROR; - if (in_constr_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()"); - in_constr_call_ = true; + if (in_create_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()"); + in_create_call_ = true; ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, queue_event_callback, data, private_segment_size, group_segment_size, queue, &status); @@ -79,8 +81,14 @@ class InterceptQueue { (*obj_map_)[(uint64_t)(*queue)] = obj; status = proxy->SetInterceptCB(OnSubmitCB, obj); obj->queue_event_callback_ = callback; + obj->queue_id = current_queue_id; + ++current_queue_id; - in_constr_call_ = false; + if (create_callback_ != 
NULL) { + status = create_callback_(*queue, callback_data_); + } + + in_create_call_ = false; return status; } @@ -139,13 +147,17 @@ class InterceptQueue { } // Prepareing dispatch callback data - const char* kernel_name = GetKernelName(dispatch_packet); + uint64_t kernel_symbol = GetKernelSymbol(dispatch_packet); + const char* kernel_name = GetKernelName(kernel_symbol); rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, obj->queue_, user_que_idx, + obj->queue_id, dispatch_packet, kernel_name, + kernel_symbol, + syscall(__NR_gettid), (tracker_entry) ? tracker_entry->record : NULL}; // Calling dispatch callback @@ -154,7 +166,10 @@ class InterceptQueue { free(const_cast(kernel_name)); // Injecting profiling start/stop packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { - if (tracker_entry != NULL) tracker_->Delete(tracker_entry); + if (tracker_entry != NULL) { + const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; + tracker_->Delete(tracker_entry); + } } else { Context* context = reinterpret_cast(group.context); @@ -197,10 +212,15 @@ class InterceptQueue { } } - static void SetCallbacks(rocprofiler_callback_t dispatch_callback, queue_callback_t destroy_callback, void* data) { + static void SetCallbacks(rocprofiler_callback_t dispatch_callback, + queue_callback_t create_callback, + queue_callback_t destroy_callback, + void* data) + { std::lock_guard lck(mutex_); callback_data_ = data; dispatch_callback_ = dispatch_callback; + create_callback_ = create_callback; destroy_callback_ = destroy_callback; } @@ -219,7 +239,7 @@ class InterceptQueue { return static_cast((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask); } - static const char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) { + static uint64_t GetKernelSymbol(const hsa_kernel_dispatch_packet_t* dispatch_packet) { const amd_kernel_code_t* kernel_code = NULL; hsa_status_t status = 
util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( @@ -228,8 +248,12 @@ class InterceptQueue { if (HSA_STATUS_SUCCESS != status) { kernel_code = reinterpret_cast(dispatch_packet->kernel_object); } - amd_runtime_loader_debug_info_t* dbg_info = reinterpret_cast( - kernel_code->runtime_loader_kernel_symbol); + return kernel_code->runtime_loader_kernel_symbol; + } + + static const char* GetKernelName(const uint64_t kernel_symbol) { + amd_runtime_loader_debug_info_t* dbg_info = + reinterpret_cast(kernel_symbol); const char* kernel_name = (dbg_info != NULL) ? dbg_info->kernel_name : NULL; // Kernel name is mangled name @@ -288,18 +312,21 @@ class InterceptQueue { static mutex_t mutex_; static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; static rocprofiler_callback_t dispatch_callback_; + static queue_callback_t create_callback_; static queue_callback_t destroy_callback_; static void* callback_data_; static obj_map_t* obj_map_; static const char* kernel_none_; static Tracker* tracker_; static bool tracker_on_; - static bool in_constr_call_; + static bool in_create_call_; + static queue_id_t current_queue_id; hsa_queue_t* const queue_; ProxyQueue* const proxy_; const util::AgentInfo* agent_info_; queue_event_callback_t queue_event_callback_; + queue_id_t queue_id; }; } // namespace rocprofiler diff --git a/src/core/metrics.h b/src/core/metrics.h index 8f05a3e7..46806dcf 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -195,7 +195,7 @@ class MetricsDict { } static hsa_ven_amd_aqlprofile_id_query_t Translate(const util::AgentInfo* agent_info, const std::string& block_name) { - hsa_ven_amd_aqlprofile_profile_t profile; + hsa_ven_amd_aqlprofile_profile_t profile{}; profile.agent = agent_info->dev_id; hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0}; hsa_status_t status = diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index e8901387..6042e59e 100644 --- 
a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -56,6 +56,16 @@ THE SOFTWARE. // Internal library methods // namespace rocprofiler { +hsa_status_t CreateQueuePro( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue); + decltype(hsa_queue_create)* hsa_queue_create_fn; decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; @@ -115,6 +125,11 @@ void RestoreHsaApi() { table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; } +void StandaloneIntercept() { + ::HsaApiTable* table = kHsaApiTable; + table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro; +} + typedef void (*tool_handler_t)(); typedef void (*tool_handler_prop_t)(rocprofiler_settings_t*); void * tool_handle = NULL; @@ -195,9 +210,7 @@ DESTRUCTOR_API void destructor() { const MetricsDict* GetMetrics(const hsa_agent_t& agent) { rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); - if (agent_info == NULL) { - EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); - } + if (agent_info == NULL) EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); const MetricsDict* metrics = MetricsDict::Create(agent_info); if (metrics == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); return metrics; @@ -209,6 +222,94 @@ hsa_status_t GetExcStatus(const std::exception& e) { : HSA_STATUS_ERROR; } + +inline size_t CreateEnableCmd(const hsa_agent_t& agent, packet_t* command, const size_t& slot_count) { + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); + const size_t 
packet_count = (is_legacy) ? Profile::LEGACY_SLOT_SIZE_PKT : 1; + + if (packet_count > slot_count) EXC_RAISING(HSA_STATUS_ERROR, "packet_count > slot_count"); + + // AQLprofile object + hsa_ven_amd_aqlprofile_profile_t profile{}; + profile.agent = agent_info->dev_id; + // Query for cmd buffer size + hsa_status_t status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, NULL); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "get_info(ENABLE_CMD).size exc"); + if (profile.command_buffer.size == 0) EXC_RAISING(status, "get_info(ENABLE_CMD).size == 0"); + // Allocate cmd buffer + const size_t aligment_mask = 0x100 - 1; + profile.command_buffer.ptr = + hsa_rsrc->AllocateSysMemory(agent_info, profile.command_buffer.size); + if ((reinterpret_cast(profile.command_buffer.ptr) & aligment_mask) != 0) { + EXC_RAISING(status, "profile.command_buffer.ptr bad alignment"); + } + + // Generating cmd packet + if (is_legacy) { + packet_t packet{}; + + // Query for cmd buffer data + status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, &packet); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "get_info(ENABLE_CMD).data exc"); + + // Check for legacy GFXIP + status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_legacy_get_pm4(&packet, command); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + } else { + // Query for cmd buffer data + status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, command); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "get_info(ENABLE_CMD).data exc"); + } + + // Return cmd packet data size + return (packet_count * sizeof(packet_t)); +} + +hsa_status_t CreateQueuePro( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t 
*source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue) +{ + static packet_t enable_cmd_packet[Profile::LEGACY_SLOT_SIZE_PKT]; + static size_t enable_cmd_size = 0; + static std::mutex enable_cmd_mutex; + + // Create HSA queue + hsa_status_t status = hsa_queue_create_fn( + agent, + size, + type, + callback, + data, + private_segment_size, + group_segment_size, + queue); + if (status != HSA_STATUS_SUCCESS) return status; + + // Create 'Enable' cmd packet + if (enable_cmd_size == 0) { + std::lock_guard lck(enable_cmd_mutex); + if (enable_cmd_size == 0) { + enable_cmd_size = CreateEnableCmd(agent, enable_cmd_packet, Profile::LEGACY_SLOT_SIZE_PKT); + } + } + + // Enable counters for the queue + rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, enable_cmd_packet, enable_cmd_size); + + return HSA_STATUS_SUCCESS; +} + rocprofiler_properties_t rocprofiler_properties; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; @@ -261,7 +362,10 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa if (intercept_mode) { rocprofiler::ProxyQueue::HsaIntercept(table); rocprofiler::InterceptQueue::HsaIntercept(table); + } else { + rocprofiler::StandaloneIntercept(); } + return true; } @@ -419,14 +523,14 @@ PUBLIC_API hsa_status_t rocprofiler_get_metrics(const rocprofiler_t* handle) { // Set/remove queue callbacks PUBLIC_API hsa_status_t rocprofiler_set_queue_callbacks(rocprofiler_queue_callbacks_t callbacks, void* data) { API_METHOD_PREFIX - rocprofiler::InterceptQueue::SetCallbacks(callbacks.dispatch, callbacks.destroy, data); + rocprofiler::InterceptQueue::SetCallbacks(callbacks.dispatch, callbacks.create, callbacks.destroy, data); API_METHOD_SUFFIX } // Remove queue callbacks PUBLIC_API hsa_status_t rocprofiler_remove_queue_callbacks() { API_METHOD_PREFIX - rocprofiler::InterceptQueue::SetCallbacks(NULL, NULL, NULL); + 
rocprofiler::InterceptQueue::SetCallbacks(NULL, NULL, NULL, NULL); API_METHOD_SUFFIX } diff --git a/src/core/tracker.h b/src/core/tracker.h index acbf5cf6..ab7f3b5d 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -49,6 +49,7 @@ class Tracker { typedef sig_list_t::iterator sig_list_it_t; struct entry_t { + std::atomic valid; Tracker* tracker; sig_list_t::iterator it; hsa_agent_t agent; @@ -100,7 +101,7 @@ class Tracker { // Adding antry to the list mutex_.lock(); - entry->it = sig_list_.insert(sig_list_.begin(), entry); + entry->it = sig_list_.insert(sig_list_.end(), entry); mutex_.unlock(); return entry; @@ -142,7 +143,7 @@ class Tracker { void Erase(const sig_list_it_t& it) { Delete(*it); } // Entry completion - void Complete(entry_t* entry) { + inline void Complete(entry_t* entry) { record_t* record = entry->record; // Debug trace @@ -160,6 +161,7 @@ class Tracker { record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end); record->complete = hsa_rsrc_->TimestampNs(); + entry->valid.store(true, std::memory_order_release); // Original intercepted signal completion hsa_signal_t orig = entry->orig; @@ -174,6 +176,19 @@ class Tracker { } } + inline static void HandleEntry(entry_t* entry) { + // Call entry handler + void* handler = static_cast(entry->handler); + if (entry->context_active) { + reinterpret_cast(handler)(0, entry->arg); + } else { + rocprofiler_group_t group{}; + reinterpret_cast(handler)(group, entry->arg); + } + // Delete tracker entry + entry->tracker->Delete(entry); + } + // Handler for packet completion static bool Handler(hsa_signal_value_t, void* arg) { // Acquire entry @@ -182,20 +197,31 @@ class Tracker { while (ptr->load(std::memory_order_acquire) == NULL) sched_yield(); // Complete entry - entry->tracker->Complete(entry); + Tracker* tracker = entry->tracker; + tracker->Complete(entry); - // Call entry handler - void* handler = static_cast(entry->handler); - if 
(entry->context_active) { - reinterpret_cast(handler)(0, entry->arg); + if (ordering_enabled_ == false) { + HandleEntry(entry); } else { - rocprofiler_group_t group{}; - reinterpret_cast(handler)(group, entry->arg); + // Acquire last entry + entry_t* back = tracker->sig_list_.back(); + volatile std::atomic* ptr = &back->handler; + while (ptr->load(std::memory_order_acquire) == NULL) sched_yield(); + + tracker->handler_mutex_.lock(); + sig_list_it_t it = tracker->sig_list_.begin(); + sig_list_it_t end = back->it; + while (it != end) { + entry = *(it++); + if (entry->valid.load(std::memory_order_acquire)) { + HandleEntry(entry); + } else { + break; + } + } + tracker->handler_mutex_.unlock(); } - // Delete tracker entry - entry->tracker->Delete(entry); - return false; } @@ -203,10 +229,13 @@ class Tracker { sig_list_t sig_list_; // Inter-thread synchronization mutex_t mutex_; + mutex_t handler_mutex_; // Outstanding dispatches std::atomic outstanding_; // HSA resources factory util::HsaRsrcFactory* hsa_rsrc_; + // Handling ordering enabled + static const bool ordering_enabled_ = true; // Enable tracing static const bool trace_on_ = false; }; diff --git a/src/core/types.h b/src/core/types.h index c58d6cf2..ef8600f0 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -26,7 +26,7 @@ THE SOFTWARE. 
#include namespace rocprofiler { -typedef hsa_ven_amd_aqlprofile_1_00_pfn_t pfn_t; +typedef hsa_ven_amd_aqlprofile_pfn_t pfn_t; typedef hsa_ven_amd_aqlprofile_event_t event_t; typedef hsa_ven_amd_aqlprofile_parameter_t parameter_t; typedef hsa_ven_amd_aqlprofile_profile_t profile_t; diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 3c50d27d..2d64bae0 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -134,13 +134,13 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else - status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_); + status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table loader_api_ = {0}; - status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); + status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer @@ -527,6 +527,7 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Print the various fields of Hsa Gpu Agents bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::cout << std::flush; std::clog << header << " :" << std::endl; const AgentInfo* agent_info; @@ -550,7 +551,7 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { - const uint32_t slot_size_b = 0x40; + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); @@ -578,7 +579,7 @@ uint64_t 
HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) { - const uint32_t slot_size_b = 0x40; + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; if ((size_bytes & (slot_size_b - 1)) != 0) { fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes); abort(); diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index c76046d2..9997a81c 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -123,7 +123,7 @@ struct AgentInfo { // HSA timer class // Provides current HSA timestampa and system-clock/ns conversion API class HsaTimer { - public: + public: typedef uint64_t timestamp_t; static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; @@ -136,8 +136,12 @@ class HsaTimer { } // Methids for system-clock/ns conversion - timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } - timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); } + timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { + return timestamp_t((freq_t)sysclock * sysclock_factor_); + } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { + return timestamp_t((freq_t)time / sysclock_factor_); + } // Return timestamp in 'ns' timestamp_t timestamp_ns() const { @@ -147,13 +151,14 @@ class HsaTimer { return sysclock_to_ns(sysclock); } - private: + private: // Timestamp frequency factor freq_t sysclock_factor_; }; class HsaRsrcFactory { public: + static const size_t CMD_SLOT_SIZE_B = 0x40; typedef std::recursive_mutex mutex_t; typedef HsaTimer::timestamp_t timestamp_t; @@ -272,7 +277,7 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); // Return AqlProfile API table - typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t; + 
typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } // Return Loader API table diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2f35639d..c7d86ccf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -35,19 +35,47 @@ endif () ## Util sources file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" ) +## Standalone test sources +set ( STEXE_NAME "standalone_test" ) +set ( STTST_SRC + ${TEST_DIR}/app/standalone_test.cpp + ${TEST_DIR}/ctrl/test_hsa.cpp +) + +## Intercept test sources +set ( INEXE_NAME "intercept_test" ) +set ( INTST_SRC + ${TEST_DIR}/app/intercept_test.cpp + ${TEST_DIR}/ctrl/test_hsa.cpp +) + ## Test control sources set ( CTRL_SRC ${TEST_DIR}/app/test.cpp ${TEST_DIR}/ctrl/test_hsa.cpp ) -## Test kernel sources +## Dummy kernel +set ( DUMMY_NAME dummy_kernel ) +execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${DUMMY_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) + +## Test kernel set ( TEST_NAME simple_convolution ) set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${TEST_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) -## Building test executable -add_executable ( ${EXE_NAME} ${KERN_SRC} ${CTRL_SRC} ${UTIL_SRC} ) +## Building standalone test executable +add_executable ( ${STEXE_NAME} ${STTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${STEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries( ${STEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) + +## Building intercept test executable +add_library ( ${INEXE_NAME} SHARED ${INTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${INEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries( ${INEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) + +## Building ctrl test executable +add_executable ( 
${EXE_NAME} ${CTRL_SRC} ${UTIL_SRC} ${KERN_SRC} ) target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) diff --git a/test/app/intercept_test.cpp b/test/app/intercept_test.cpp new file mode 100644 index 00000000..87e00d64 --- /dev/null +++ b/test/app/intercept_test.cpp @@ -0,0 +1,231 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include + +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "ctrl/test_hsa.h" +#include "inc/rocprofiler.h" +#include "dummy_kernel/dummy_kernel.h" +#include "simple_convolution/simple_convolution.h" +#include "util/test_assert.h" + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +// Dispatch callbacks and context handlers synchronization +pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; +// Tool is unloaded +volatile bool is_loaded = false; + +// Error handler +void fatal(const std::string msg) { + fflush(stdout); + fprintf(stderr, "%s\n\n", msg.c_str()); + fflush(stderr); + abort(); +} + +// Check returned HSA API status +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// Context stored entry type +struct context_entry_t { + bool valid; + hsa_agent_t agent; + rocprofiler_group_t group; + rocprofiler_callback_data_t data; +}; + +// Dump stored context entry +void dump_context_entry(context_entry_t* entry) { + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); + + const std::string kernel_name = entry->data.kernel_name; + const rocprofiler_dispatch_record_t* record = entry->data.record; + + fflush(stdout); + fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\") tid(%ld) queue-id(%u) gpu-id(%u) ", + entry->data.kernel_object, + kernel_name.c_str(), + entry->data.thread_id, + entry->data.queue_id, + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index); + if (record) fprintf(stdout, "time(%lu,%lu,%lu,%lu)", + record->dispatch, + 
record->begin, + record->end, + record->complete); + fprintf(stdout, "\n"); + fflush(stdout); + + rocprofiler_group_t& group = entry->group; + if (group.context == NULL) { + fprintf(stderr, "tool error: context is NULL\n"); + abort(); + } + + rocprofiler_close(group.context); +} + +// Profiling completion handler +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(entry); + delete entry; + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* /*user_data*/, + rocprofiler_group_t* group) { + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + // Profiling context + rocprofiler_t* context = NULL; + + // Context entry + context_entry_t* entry = new context_entry_t(); + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = (void*)entry; + + // Open profiling context + status = rocprofiler_open(callback_data->agent, NULL, 0, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + // Fill profiling context entry + entry->agent = callback_data->agent; + entry->group = *group; + entry->data = *callback_data; + entry->data.kernel_name = strdup(callback_data->kernel_name); + reinterpret_cast*>(&entry->valid)->store(true); + + return HSA_STATUS_SUCCESS; +} + +void initialize() { + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) { + 
fprintf(stderr, "GetGpuAgentInfo failed\n"); + abort(); + } + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{}; + callbacks_ptrs.dispatch = dispatch_callback; + rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL); +} + +void cleanup() { + // Unregister dispatch callback + rocprofiler_remove_queue_callbacks(); + + // Dump stored profiling output data + fflush(stdout); +} + +// Tool constructor +extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) +{ + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + if (is_loaded) return; + is_loaded = true; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + // Enable timestamping + settings->timestamp_on = true; + + // Initialize profiling + initialize(); +} + +// Tool destructor +extern "C" PUBLIC_API void OnUnloadTool() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + if (!is_loaded) return; + is_loaded = false; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + // Final resources cleanup + cleanup(); +} + +extern "C" CONSTRUCTOR_API void constructor() { + printf("INTT constructor\n"); fflush(stdout); +} + +extern "C" DESTRUCTOR_API void destructor() { + if (is_loaded == true) OnUnloadTool(); +} diff --git a/test/app/standalone_test.cpp b/test/app/standalone_test.cpp new file mode 100644 index 00000000..f6fc965e --- /dev/null +++ b/test/app/standalone_test.cpp @@ -0,0 +1,163 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "ctrl/test_hsa.h" +#include "inc/rocprofiler.h" +#include "dummy_kernel/dummy_kernel.h" +#include "simple_convolution/simple_convolution.h" +#include "util/test_assert.h" + +void print_features(rocprofiler_feature_t* feature, uint32_t feature_count) { + for (rocprofiler_feature_t* p = feature; p < feature + feature_count; ++p) { + std::cout << (p - feature) << ": " << p->name; + switch (p->data.kind) { + case ROCPROFILER_DATA_KIND_INT64: + std::cout << std::dec << " result64 (" << p->data.result_int64 << ")" << std::endl; + break; + case ROCPROFILER_DATA_KIND_BYTES: { + const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); + uint64_t size = 0; + for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { + size = *reinterpret_cast(ptr); + const char* data = ptr + sizeof(size); + std::cout << std::endl; + std::cout << std::hex << " data (" << (void*)data << ")" << std::endl; + std::cout << std::dec << " size (" << size << ")" << std::endl; + ptr = data + size; + } + break; + } + default: + std::cout << "result kind (" << p->data.kind << ")" << std::endl; + TEST_ASSERT(false); + } + } +} + +void read_features(uint32_t n, rocprofiler_t* context, rocprofiler_feature_t* feature, const unsigned feature_count) { + std::cout << "read features" << std::endl; + hsa_status_t status = rocprofiler_read(context, n); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + std::cout << "read issue" << std::endl; + status = rocprofiler_get_data(context, n); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + status = rocprofiler_get_metrics(context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + print_features(feature, feature_count); +} + +int main() { + bool ret_val = false; + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + // Profiling context + rocprofiler_t* context = 
NULL; + // Profiling properties + rocprofiler_properties_t properties; + + // Profiling feature objects + const unsigned feature_count = 9; + rocprofiler_feature_t feature[feature_count]; + // PMC events + memset(feature, 0, sizeof(feature)); + feature[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[0].name = "GRBM_COUNT"; + feature[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[1].name = "GRBM_GUI_ACTIVE"; + feature[2].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[2].name = "GPUBusy"; + feature[3].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[3].name = "SQ_WAVES"; + feature[4].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[4].name = "SQ_INSTS_VALU"; + feature[5].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[5].name = "VALUInsts"; + feature[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[6].name = "TCC_HIT_sum"; + feature[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[7].name = "TCC_MISS_sum"; + feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[8].name = "WRITE_SIZE"; + + // Instantiate HSA resources + HsaRsrcFactory::Create(); + + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) abort(); + + // Creating the queues pool + const unsigned queue_count = 16; + hsa_queue_t* queue[queue_count]; + for (unsigned queue_ind = 0; queue_ind < queue_count; ++queue_ind) { + if (HsaRsrcFactory::Instance().CreateQueue(agent_info, 128, &queue[queue_ind]) == false) abort(); + } + hsa_queue_t* prof_queue = queue[0]; + + // Creating profiling context + properties = {}; + properties.queue = prof_queue; + status = rocprofiler_open(agent_info->dev_id, feature, feature_count, &context, + ROCPROFILER_MODE_STANDALONE, &properties); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + // Test initialization + TestHsa::HsaInstantiate(); + + // Dispatching profiled kernel n-times to collect all counter groups data + const unsigned group_n = 0; + status = 
rocprofiler_start(context, group_n); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + std::cout << "start" << std::endl; + + for (unsigned ind = 0; ind < 3; ++ind) { +#if 1 + const unsigned queue_ind = ind % queue_count; + hsa_queue_t* prof_queue = queue[queue_ind]; + //ret_val = RunKernel(0, NULL, NULL, prof_queue); + ret_val = RunKernel(0, NULL, NULL, prof_queue); + std::cout << "run kernel, queue " << queue_ind << std::endl; +#else + sleep(3); +#endif + read_features(group_n, context, feature, feature_count); + } + + // Stop counters + status = rocprofiler_stop(context, group_n); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + std::cout << "stop" << std::endl; + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + status = rocprofiler_close(context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + return (ret_val) ? 0 : 1; +} diff --git a/test/app/test.cpp b/test/app/test.cpp index 9e694833..796ba1eb 100644 --- a/test/app/test.cpp +++ b/test/app/test.cpp @@ -21,20 +21,66 @@ THE SOFTWARE. 
*******************************************************************************/ #include +#include #include #include +#include #include "ctrl/run_kernel.h" #include "ctrl/test_aql.h" +#include "dummy_kernel/dummy_kernel.h" #include "simple_convolution/simple_convolution.h" +void thread_fun(const int kiter, const int diter, const uint32_t agents_number) { + const AgentInfo* agent_info[agents_number]; + hsa_queue_t* queue[agents_number]; + HsaRsrcFactory* rsrc = &HsaRsrcFactory::Instance(); + + for (uint32_t n = 0; n < agents_number; ++n) { + uint32_t agent_id = n % rsrc->GetCountOfGpuAgents(); + if (rsrc->GetGpuAgentInfo(agent_id, &agent_info[n]) == false) { + fprintf(stderr, "AgentInfo failed\n"); + abort(); + } + if (rsrc->CreateQueue(agent_info[n], 128, &queue[n]) == false) { + fprintf(stderr, "CreateQueue failed\n"); + abort(); + } + } + + for (int i = 0; i < kiter; ++i) { + for (uint32_t n = 0; n < agents_number; ++n) { + // RunKernel(0, NULL, agent_info[n], queue[n], diter); + RunKernel(0, NULL, agent_info[n], queue[n], diter); + } + } + + for (uint32_t n = 0; n < agents_number; ++n) { + hsa_queue_destroy(queue[n]); + } +} + int main(int argc, char** argv) { const char* kiter_s = getenv("ROCP_KITER"); const char* diter_s = getenv("ROCP_DITER"); + const char* agents_s = getenv("ROCP_AGENTS"); + const char* thrs_s = getenv("ROCP_THRS"); + const int kiter = (kiter_s != NULL) ? atol(kiter_s) : 1; const int diter = (diter_s != NULL) ? atol(diter_s) : 1; + const uint32_t agents_number = (agents_s != NULL) ? (uint32_t)atol(agents_s) : 1; + const int thrs = (thrs_s != NULL) ? 
atol(thrs_s) : 1; + TestHsa::HsaInstantiate(); - for (int i = 0; i < kiter; ++i) RunKernel(argc, argv, diter); + + std::thread t[thrs]; + for (int n = 0; n < thrs; ++n) { + t[n] = std::thread(thread_fun, kiter, diter, agents_number); + } + for (int n = 0; n < thrs; ++n) { + t[n].join(); + } + TestHsa::HsaShutdown(); return 0; } diff --git a/test/ctrl/run_kernel.h b/test/ctrl/run_kernel.h index b122664b..846e0b68 100644 --- a/test/ctrl/run_kernel.h +++ b/test/ctrl/run_kernel.h @@ -26,13 +26,20 @@ THE SOFTWARE. #include "ctrl/test_hsa.h" #include "util/test_assert.h" -template bool RunKernel(int argc, char* argv[], int count = 1) { +template bool RunKernel(int argc = 0, char* argv[] = NULL, const AgentInfo* agent_info = NULL, hsa_queue_t* queue = NULL, int count = 1) { bool ret_val = false; + if (getenv("ROC_TEST_TRACE") == NULL) std::clog.rdbuf(NULL); + + // Create test kernel object Kernel test_kernel; - TestAql* test_aql = new TestHsa(&test_kernel); - test_aql = new Test(test_aql); + + TestHsa* test_hsa = new TestHsa(&test_kernel); + test_hsa->SetAgentInfo(agent_info); + test_hsa->SetQueue(queue); + + TestAql* test_aql = new Test(test_hsa); TEST_ASSERT(test_aql != NULL); if (test_aql == NULL) return 1; diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp index 87861821..d006d19c 100644 --- a/test/ctrl/test_hsa.cpp +++ b/test/ctrl/test_hsa.cpp @@ -29,60 +29,54 @@ THE SOFTWARE. 
#include "util/hsa_rsrc_factory.h" HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL; -const AgentInfo* TestHsa::agent_info_ = NULL; -hsa_queue_t* TestHsa::hsa_queue_ = NULL; -uint32_t TestHsa::agent_id_ = 0; -HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) { +HsaRsrcFactory* TestHsa::HsaInstantiate() { // Instantiate an instance of Hsa Resources Factory if (hsa_rsrc_ == NULL) { - agent_id_ = agent_ind; - hsa_rsrc_ = HsaRsrcFactory::Create(); - // Print properties of the agents hsa_rsrc_->PrintGpuAgents("> GPU agents"); - - // Create an instance of Gpu agent - if (!hsa_rsrc_->GetGpuAgentInfo(agent_ind, &agent_info_)) { - agent_info_ = NULL; - std::cerr << "> error: agent[" << agent_ind << "] is not found" << std::endl; - return NULL; - } - std::clog << "> Using agent[" << agent_ind << "] : " << agent_info_->name << std::endl; - - // Create an instance of Aql Queue - if (hsa_queue_ == NULL) { - uint32_t num_pkts = 128; - if (hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_) == false) { - hsa_queue_ = NULL; - TEST_ASSERT(false); - } - } } return hsa_rsrc_; } void TestHsa::HsaShutdown() { - if (hsa_queue_ != NULL) { - hsa_queue_destroy(hsa_queue_); - hsa_queue_ = NULL; - } if (hsa_rsrc_) hsa_rsrc_->Destroy(); } -bool TestHsa::Initialize(int arg_cnt, char** arg_list) { +bool TestHsa::Initialize(int /*arg_cnt*/, char** /*arg_list*/) { std::clog << "TestHsa::Initialize :" << std::endl; // Instantiate a Timer object setup_timer_idx_ = hsa_timer_.CreateTimer(); dispatch_timer_idx_ = hsa_timer_.CreateTimer(); - if (HsaInstantiate(agent_id_) == NULL) { + if (hsa_rsrc_ == NULL) { TEST_ASSERT(false); return false; } + // Create an instance of Gpu agent + if (agent_info_ == NULL) { + const uint32_t agent_id = 0; + if (!hsa_rsrc_->GetGpuAgentInfo(agent_id, &agent_info_)) { + agent_info_ = NULL; + std::cerr << "> error: agent[" << agent_id << "] is not found" << std::endl; + return NULL; + } + } + std::clog << "> Using agent[" << agent_info_->dev_index << "] : 
" << agent_info_->name << std::endl; + + // Create an instance of Aql Queue + if (hsa_queue_ == NULL) { + const uint32_t num_pkts = 128; + if (hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_) == false) { + hsa_queue_ = NULL; + TEST_ASSERT(false); + } + my_queue_ = true; + } + // Obtain handle of signal hsa_rsrc_->CreateSignal(1, &hsa_signal_); @@ -119,6 +113,8 @@ bool TestHsa::Setup() { mem_map_t& mem_map = test_->GetMemMap(); for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) { mem_descr_t& des = it->second; + if (des.size == 0) continue; + switch (des.id) { case TestKernel::LOCAL_DES_ID: des.ptr = hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size); @@ -220,7 +216,7 @@ bool TestHsa::Run() { // Submit AQL packet to the queue const uint64_t que_idx = hsa_rsrc_->Submit(hsa_queue_, &aql); - std::clog << "> Waiting on kernel dispatch signal, que_idx=" << que_idx << std::endl; + std::clog << "> Waiting on kernel dispatch signal, que_idx=" << que_idx << std::endl << std::flush; // Wait on the dispatch signal until the kernel is finished. // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling @@ -245,6 +241,8 @@ bool TestHsa::VerifyResults() { const uint32_t size = test_->GetOutputSize(); bool suc = false; + if (size == 0) return true; + // Copy local kernel output buffers from local memory into host memory if (test_->IsOutputLocal()) { output = hsa_rsrc_->AllocateSysMemory(agent_info_, size); @@ -279,5 +277,8 @@ void TestHsa::PrintTime() { bool TestHsa::Cleanup() { hsa_executable_destroy(hsa_exec_); hsa_signal_destroy(hsa_signal_); + if (my_queue_) hsa_queue_destroy(hsa_queue_); + hsa_queue_ = NULL; + agent_info_ = NULL; return true; } diff --git a/test/ctrl/test_hsa.h b/test/ctrl/test_hsa.h index 84080e77..b5df8b69 100644 --- a/test/ctrl/test_hsa.h +++ b/test/ctrl/test_hsa.h @@ -32,23 +32,27 @@ THE SOFTWARE. 
class TestHsa : public TestAql { public: // Instantiate HSA resources - static HsaRsrcFactory* HsaInstantiate(const uint32_t agent_ind = agent_id_); + static HsaRsrcFactory* HsaInstantiate(); static void HsaShutdown(); - static void SetQueue(hsa_queue_t* queue) { hsa_queue_ = queue; } - static uint32_t HsaAgentId() { return agent_id_; } // Constructor explicit TestHsa(TestKernel* test) : test_(test), name_(test->Name()) { total_time_taken_ = 0; setup_time_taken_ = 0; dispatch_time_taken_ = 0; + agent_info_ = NULL; + hsa_queue_ = NULL; + my_queue_ = false; hsa_exec_ = {}; } // Get methods for Agent Info, HAS queue, HSA Resourcse Manager + HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } + hsa_agent_t HsaAgent() { return agent_info_->dev_id; } const AgentInfo* GetAgentInfo() { return agent_info_; } + void SetAgentInfo(const AgentInfo* agent_info) { agent_info_ = agent_info; } hsa_queue_t* GetQueue() { return hsa_queue_; } - HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } + void SetQueue(hsa_queue_t* queue) { hsa_queue_ = queue; } // Initialize application environment including setting // up of various configuration parameters based on @@ -105,14 +109,12 @@ class TestHsa : public TestAql { // Instance of Hsa Resources Factory static HsaRsrcFactory* hsa_rsrc_; - // GPU id - static uint32_t agent_id_; - // Handle to an Hsa Gpu Agent - static const AgentInfo* agent_info_; + const AgentInfo* agent_info_; // Handle to an Hsa Queue - static hsa_queue_t* hsa_queue_; + hsa_queue_t* hsa_queue_; + bool my_queue_; // Test kernel name std::string name_; diff --git a/test/dummy_kernel/dummy_kernel.cl b/test/dummy_kernel/dummy_kernel.cl new file mode 100644 index 00000000..4ab159c8 --- /dev/null +++ b/test/dummy_kernel/dummy_kernel.cl @@ -0,0 +1,28 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +/** + dummy kernel + */ +__kernel void DummyKernel() { + uint tid = get_global_id(0); +} diff --git a/test/dummy_kernel/dummy_kernel.h b/test/dummy_kernel/dummy_kernel.h new file mode 100644 index 00000000..1b8ce430 --- /dev/null +++ b/test/dummy_kernel/dummy_kernel.h @@ -0,0 +1,71 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ +#define TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ + +#include +#include + +#include "ctrl/test_kernel.h" + +// Class implements DummyKernel kernel parameters +class DummyKernel : public TestKernel { + public: + // Kernel buffers IDs + enum { KERNARG_BUF_ID, LOCAL_BUF_ID }; + + // Constructor + DummyKernel() : + width_(64), + height_(64) + { + SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, 0); + SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, 0); + } + + // Initialize method + void Init() {} + + // Return compute grid size + uint32_t GetGridSize() const { return width_ * height_; } + + // Print output + void PrintOutput(const void* ptr) const {} + + // Return name + std::string Name() const { return std::string("DummyKernel"); } + + private: + // Reference CPU implementation + bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask, + const uint32_t width, const uint32_t height, + const uint32_t maskWidth, const uint32_t maskHeight) { return true; } + + // Width of the Input array + const uint32_t width_; + + // Height of the Input array + const uint32_t height_; +}; + +#endif // TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ diff --git a/test/dummy_kernel/gfx8_DummyKernel.hsaco b/test/dummy_kernel/gfx8_DummyKernel.hsaco new file mode 100755 index 0000000000000000000000000000000000000000..35866785c020e2fbdae760ccd0768cf680fc3f5d GIT binary patch literal 10952 zcmeI2O>7&-6~~7kp(vUbXoX@`MKRW{DyK$7NK4A0gQAsV$5s$JQ6e{~lN2&7t|iL+ zDn%Jm5Up3PAu%!$s6Y-zFVU$$4nckJp+yVIHhOS@9MZx-jX(u*aDf8NrG;Cdhe~~K z- zq(l(D!y+UC(DvJm`iICsFO^$%Fd?K3GDnQ#b}MNN3`JV02gf~u9js5aWxG=dX=Oi- zvqNe)7>Z_pJGRVCY2`RC@UJpXuJ83fRO80q%WE)>mrme6{KmQPa}x#c zV6Xd#OZ%wYqbKU=!j*HL{VZ@V@Pod9@b5z}A@UpuyX6PVZk zhS;~SuP^k5QJKz_iqC7ZysZs9tMzL?2$BBjt)7sk^=YqU3dR%=HA_k zh^{H_f=^4R(22PvRR{eDqlLE$-jEx!n9HSz7xlGJ~LN#=6UHz z@_#@7*~dKpTmOFkg2;N@=FdJtuRsMs|Dm 
zNF{eEQ#E#X*dCh?u{<1aW-b{QDkV9mra1@dbY}KsK2v?ORJlkB>}{hD9tp+caXdNP z;%30jfSUm~18xS~47eF^Gw^?(0eR2nk#|Zv-k3En<7eV7G(Gm+P&41XuKE3eO(?s2 zJkFklH6iM2cf?}jwl`u1!co0tUn~_AE5BQadc{Z4SWoNM+Ml@+xifEhqvn*h*f0Z& zwbP=uw&<{0!wlAH2Z1d`?#F(C8{rgiZozLi2*WpxK$NKzK+XqW!&?&@w>tVPj zkAi;(j5pgh-v7(y`)gpA%<#q%YBWO|OTh!EX~Wj^BSq8QucYaWt*OSEYMnixsaMfd zwKcVarY9ev3G^%l4lFh-Z_G3U8?`v(!Gau4X=jIdvG0~f60pe1hg+U zF2Xw2!ni*orsZviJwofj<7|m*jmsi*Bebo*-Y~=Unq_m09Wmi|3OjcAz;XkQU+q@- z-aP!qVY6G|&*x=-^RZoR@>y;60ex0GYO6K6Q9}^4dFqWRZMhMHwzoB``+PCjf9M5DSDi=`GKox0WDl?v^J{THtD{yVoGgd~T-e=m%OJxBWw z_8w}(9AJK|2+fb@#AYzu`vsi;-n}p{%oniFgXKmS^hq74Z%fBJr~|gSZ6m7@I1kzn zr&d{)qsK$%Loh7F<=geO2%H;1``n1Yxk1{##B+nS*}7=&Em<)-cAcQ%PB7T(Z4H99 zP|9y>t2Ookw>--YA%@rC_p0C(zhAckfz%Zd)-5mOtQB$+TIQTguEP(aBH$Bdi_eSa zALRt{A@>TNyZfKEAf8*?47eF^GvH>x&48N$Hv?`4+zdRv3^>o4`MEJZ55>|+hMu)U zz0UpeF%Q2EApW+oSwh!g?&rtBBF}9Os5Dyd%w<=Hq;-SJjnB%=*bLnQTDf}DE&nf&c^GSsdFwZMI z#k{QW?=rU(evbSC`f|) zlJ;x~ij|;P35u1VSP63X-HC)HO(>2Wj*mjUFT=gA`TSMeZX-n zuE8$ac3bAAoLA;gPICnMasGbxlcuautw#d3a{bcc^@m)0v+H$@T0{I^CGHWNOU^Hk m|0!UOb{b-T-~T2x&NeZu%6TPz4Jtd0ALu2I50wE*Q~V3w3`%VP literal 0 HcmV?d00001 diff --git a/test/dummy_kernel/gfx9_DummyKernel.hsaco b/test/dummy_kernel/gfx9_DummyKernel.hsaco new file mode 100755 index 0000000000000000000000000000000000000000..35866785c020e2fbdae760ccd0768cf680fc3f5d GIT binary patch literal 10952 zcmeI2O>7&-6~~7kp(vUbXoX@`MKRW{DyK$7NK4A0gQAsV$5s$JQ6e{~lN2&7t|iL+ zDn%Jm5Up3PAu%!$s6Y-zFVU$$4nckJp+yVIHhOS@9MZx-jX(u*aDf8NrG;Cdhe~~K z- zq(l(D!y+UC(DvJm`iICsFO^$%Fd?K3GDnQ#b}MNN3`JV02gf~u9js5aWxG=dX=Oi- zvqNe)7>Z_pJGRVCY2`RC@UJpXuJ83fRO80q%WE)>mrme6{KmQPa}x#c zV6Xd#OZ%wYqbKU=!j*HL{VZ@V@Pod9@b5z}A@UpuyX6PVZk zhS;~SuP^k5QJKz_iqC7ZysZs9tMzL?2$BBjt)7sk^=YqU3dR%=HA_k zh^{H_f=^4R(22PvRR{eDqlLE$-jEx!n9HSz7xlGJ~LN#=6UHz z@_#@7*~dKpTmOFkg2;N@=FdJtuRsMs|Dm zNF{eEQ#E#X*dCh?u{<1aW-b{QDkV9mra1@dbY}KsK2v?ORJlkB>}{hD9tp+caXdNP z;%30jfSUm~18xS~47eF^Gw^?(0eR2nk#|Zv-k3En<7eV7G(Gm+P&41XuKE3eO(?s2 
zJkFklH6iM2cf?}jwl`u1!co0tUn~_AE5BQadc{Z4SWoNM+Ml@+xifEhqvn*h*f0Z& zwbP=uw&<{0!wlAH2Z1d`?#F(C8{rgiZozLi2*WpxK$NKzK+XqW!&?&@w>tVPj zkAi;(j5pgh-v7(y`)gpA%<#q%YBWO|OTh!EX~Wj^BSq8QucYaWt*OSEYMnixsaMfd zwKcVarY9ev3G^%l4lFh-Z_G3U8?`v(!Gau4X=jIdvG0~f60pe1hg+U zF2Xw2!ni*orsZviJwofj<7|m*jmsi*Bebo*-Y~=Unq_m09Wmi|3OjcAz;XkQU+q@- z-aP!qVY6G|&*x=-^RZoR@>y;60ex0GYO6K6Q9}^4dFqWRZMhMHwzoB``+PCjf9M5DSDi=`GKox0WDl?v^J{THtD{yVoGgd~T-e=m%OJxBWw z_8w}(9AJK|2+fb@#AYzu`vsi;-n}p{%oniFgXKmS^hq74Z%fBJr~|gSZ6m7@I1kzn zr&d{)qsK$%Loh7F<=geO2%H;1``n1Yxk1{##B+nS*}7=&Em<)-cAcQ%PB7T(Z4H99 zP|9y>t2Ookw>--YA%@rC_p0C(zhAckfz%Zd)-5mOtQB$+TIQTguEP(aBH$Bdi_eSa zALRt{A@>TNyZfKEAf8*?47eF^GvH>x&48N$Hv?`4+zdRv3^>o4`MEJZ55>|+hMu)U zz0UpeF%Q2EApW+oSwh!g?&rtBBF}9Os5Dyd%w<=Hq;-SJjnB%=*bLnQTDf}DE&nf&c^GSsdFwZMI z#k{QW?=rU(evbSC`f|) zlJ;x~ij|;P35u1VSP63X-HC)HO(>2Wj*mjUFT=gA`TSMeZX-n zuE8$ac3bAAoLA;gPICnMasGbxlcuautw#d3a{bcc^@m)0v+H$@T0{I^CGHWNOU^Hk m|0!UOb{b-T-~T2x&NeZu%6TPz4Jtd0ALu2I50wE*Q~V3w3`%VP literal 0 HcmV?d00001 diff --git a/test/run.sh b/test/run.sh index a189d18a..0a0a2f72 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,45 +22,49 @@ # THE SOFTWARE. 
################################################################################ -test_bin_dflt=./test/ctrl - # paths to ROC profiler and oher libraries export LD_LIBRARY_PATH=$PWD -# enable error messages logging to '/tmp/rocprofiler_log.txt' -export ROCPROFILER_LOG=1 - # ROC profiler library loaded by HSA runtime export HSA_TOOLS_LIB=librocprofiler64.so -# tool library loaded by ROC profiler -export ROCP_TOOL_LIB=libtool.so +# enable error messages logging to '/tmp/rocprofiler_log.txt' +export ROCPROFILER_LOG=1 # ROC profiler metrics config file unset ROCP_PROXY_QUEUE # ROC profiler metrics config file export ROCP_METRICS=metrics.xml +# test trace +export ROC_TEST_TRACE=1 + +# tool library loaded by ROC profiler +export ROCP_TOOL_LIB=./test/libintercept_test.so +../bin/run_tool.sh ./test/ctrl + +unset ROCP_TOOL_LIB +eval ./test/standalone_test + +# tool library loaded by ROC profiler +export ROCP_TOOL_LIB=libtool.so # ROC profiler kernels timing export ROCP_TIMESTAMP_ON=1 # output directory for the tool library, for metrics results file 'results.txt' +# and SQTT trace files 'thread_trace.se.out' export ROCP_OUTPUT_DIR=./RESULTS if [ ! 
-e $ROCP_TOOL_LIB ] ; then export ROCP_TOOL_LIB=test/libtool.so fi -if [ -n "$1" ] ; then - tbin="$*" -else - tbin=$test_bin_dflt -fi +export ROCP_KITER=1 +export ROCP_DITER=4 +export ROCP_INPUT=input1.xml +eval ./test/ctrl -export ROCP_KITER=100 -export ROCP_DITER=100 +export ROCP_KITER=50 +export ROCP_DITER=50 +export ROCP_AGENTS=1 +export ROCP_THRS=1 export ROCP_INPUT=input.xml -eval $tbin - -#export ROCP_KITER=1 -#export ROCP_DITER=4 -#export ROCP_INPUT=input1.xml -#eval $tbin +eval ./test/ctrl #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index 6ee5c1d6..4011c131 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -1,49 +1,39 @@ #include "gfx_metrics.xml" - # average for 16 instances - - - - # sum for 16 instances - - - - - - - - - # FETCH_SIZE, kilobytes - # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - - # WRITE_SIZE, kilobytes - # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - + + + + + + + + + + + + + + - # average for 16 instances - - - - # sum for 16 instances - - - - - - - - - - - # FETCH_SIZE, kilobytes - # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - - # WRITE_SIZE, kilobytes - # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. 
- + + + + + + + + + + + + + + + + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 373f1f7b..ac9b7a28 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -417,7 +417,8 @@ bool dump_context_entry(context_entry_t* entry) { index, entry->data.queue_index, nik_name.c_str()); - if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", + if (record) fprintf(file_handle, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)", + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, record->dispatch, record->begin, record->end, @@ -1019,10 +1020,10 @@ extern "C" PUBLIC_API void OnUnloadTool() { // Dump stored profiling output data fflush(stdout); if (result_file_opened) { - printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout); + printf("\nROCPRofiler:"); fflush(stdout); dump_context_array(NULL); fclose(result_file_handle); - printf(", output directory %s\n", result_prefix); + printf(" %u contexts collected, output directory %s\n", context_collected, result_prefix); } else { if (context_collected != context_count) { results_output_break(); diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 5404608b..0293c6c4 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -128,13 +128,13 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else - status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_); + status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table loader_api_ = {0}; - status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); + status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 
sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer @@ -520,6 +520,7 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Print the various fields of Hsa Gpu Agents bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::cout << std::flush; std::clog << header << " :" << std::endl; const AgentInfo* agent_info; @@ -543,7 +544,7 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { - const uint32_t slot_size_b = 0x40; + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); @@ -571,7 +572,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) { - const uint32_t slot_size_b = 0x40; + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; if ((size_bytes & (slot_size_b - 1)) != 0) { fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes); abort(); diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index c9466f89..738a8e2f 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -121,7 +121,7 @@ struct AgentInfo { // HSA timer class // Provides current HSA timestampa and system-clock/ns conversion API class HsaTimer { - public: + public: typedef uint64_t timestamp_t; static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; @@ -134,8 +134,12 @@ class HsaTimer { } // Methids for system-clock/ns conversion - timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } - timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); } + timestamp_t sysclock_to_ns(const timestamp_t& 
sysclock) const { + return timestamp_t((freq_t)sysclock * sysclock_factor_); + } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { + return timestamp_t((freq_t)time / sysclock_factor_); + } // Return timestamp in 'ns' timestamp_t timestamp_ns() const { @@ -145,13 +149,14 @@ class HsaTimer { return sysclock_to_ns(sysclock); } - private: + private: // Timestamp frequency factor freq_t sysclock_factor_; }; class HsaRsrcFactory { public: + static const size_t CMD_SLOT_SIZE_B = 0x40; typedef std::recursive_mutex mutex_t; typedef HsaTimer::timestamp_t timestamp_t; @@ -270,7 +275,7 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); // Return AqlProfile API table - typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t; + typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } // Return Loader API table From 1047b900d4fb22540ec0a133092b800e5d5c6308 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 19 Nov 2018 19:14:18 -0600 Subject: [PATCH 022/153] pre/post package scripts install dir name fix and CPACK_PACKAGING_INSTALL_PREFIX --- CMakeLists.txt | 2 ++ DEBIAN/postinst | 2 +- DEBIAN/prerm | 2 +- RPM/rpm_post | 2 +- RPM/rpm_postun | 2 +- bin/rpl_run.sh | 11 +++++------ bin/tblextr.py | 4 ++-- test/tool/tool.cpp | 7 ++++--- 8 files changed, 17 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c8d473d7..05ee1800 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,7 @@ add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) #if ( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) #message ( "CMAKE default prefix: ${CMAKE_INSTALL_PREFIX}" ) #endif () +set ( CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} ) set ( CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/${ROCPROFILER_NAME}" ) message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY 
DESTINATION lib ) @@ -99,6 +100,7 @@ install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION tool PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) ## Packaging directives +set ( CPACK_GENERATOR "DEB" "RPM" "TGZ" ) set ( CPACK_PACKAGE_NAME "${ROCPROFILER_NAME}-dev" ) set ( CPACK_PACKAGE_VENDOR "AMD" ) set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) diff --git a/DEBIAN/postinst b/DEBIAN/postinst index 3d022884..abec93b9 100644 --- a/DEBIAN/postinst +++ b/DEBIAN/postinst @@ -3,7 +3,7 @@ set -e do_ldconfig() { - echo /opt/rocm/librocprofiler/lib > /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig + echo /opt/rocm/rocprofiler/lib > /etc/ld.so.conf.d/librocprofiler64.conf && ldconfig } case "$1" in diff --git a/DEBIAN/prerm b/DEBIAN/prerm index b3f509a9..40946383 100644 --- a/DEBIAN/prerm +++ b/DEBIAN/prerm @@ -3,7 +3,7 @@ set -e rm_ldconfig() { - rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig + rm -f /etc/ld.so.conf.d/librocprofiler64.conf && ldconfig } case "$1" in diff --git a/RPM/rpm_post b/RPM/rpm_post index 57c5c811..d0684561 100644 --- a/RPM/rpm_post +++ b/RPM/rpm_post @@ -1 +1 @@ -echo /opt/rocm/librocprofiler/lib > /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +echo /opt/rocm/rocprofiler/lib > /etc/ld.so.conf.d/librocprofiler64.conf && ldconfig diff --git a/RPM/rpm_postun b/RPM/rpm_postun index 6b3c8f28..b9c1fadb 100644 --- a/RPM/rpm_postun +++ b/RPM/rpm_postun @@ -1 +1 @@ -rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +rm -f /etc/ld.so.conf.d/librocprofiler64.conf && ldconfig diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 043c0007..78a03fa1 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -27,7 +27,6 @@ BIN_DIR=`dirname $0` BIN_DIR=`cd $BIN_DIR; pwd` RUN_DIR=`pwd` TMP_DIR="/tmp" -DATA_PATH=$TMP_DIR DATA_DIR="rpl_data_${time_stamp}_$$" PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` @@ -195,6 +194,7 @@ if [ -z "$1" ] ; then fi INPUT_FILE="" +DATA_PATH="-" 
OUTPUT_DIR="-" output="" csv_output="" @@ -210,8 +210,7 @@ while [ 1 ] ; do elif [ "$1" = "-o" ] ; then output="$2" elif [ "$1" = "-d" ] ; then - OUTPUT_DIR="$2" - DATA_PATH=$OUTPUT_DIR + DATA_PATH=$2 elif [ "$1" = "-t" ] ; then TMP_DIR="$2" if [ "$OUTPUT_DIR" = "-" ] ; then @@ -268,8 +267,8 @@ else input_base=`basename $input_base` fi -if [ "$OUTPUT_DIR" = "--" ] ; then - fatal "Bad output dir '$OUTPUT_DIR'" +if [ "$DATA_PATH" = "-" ] ; then + DATA_PATH=$TMP_DIR fi if [ -n "$output" ] ; then @@ -290,9 +289,9 @@ echo "RPL: input file '$INPUT_FILE'" input_list="" RES_DIR="" if [ "$input_type" = "xml" ] ; then + OUTPUT_DIR=$DATA_PATH input_list=$INPUT_FILE elif [ "$input_type" = "txt" -o "$input_type" = "none" ] ; then - OUTPUT_DIR="-" RES_DIR=$DATA_PATH/$DATA_DIR if [ -e $RES_DIR ] ; then error "Rundir '$RES_DIR' exists" diff --git a/bin/tblextr.py b/bin/tblextr.py index 6a0f8eb2..87ecadd5 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -45,7 +45,7 @@ def parse_res(infile): if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found") inp = open(infile, 'r') - beg_pattern = re.compile("^dispatch\[(\d*)\], queue_index\(\d*\), kernel_name\(\"([^\"]*)\"\)") + beg_pattern = re.compile("^dispatch\[(\d*)\],.* kernel-name\(\"([^\"]*)\"\)") ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") var_pattern = re.compile("^\s*([^\s]*)\s+\((\d*)\)") @@ -55,7 +55,7 @@ def parse_res(infile): m = var_pattern.match(record) if m: - if not dispatch_number in var_table: fatal("Error: dispatch number not unique '" + str(dispatch_number) + "'") + if not dispatch_number in var_table: fatal("Error: dispatch number not found '" + str(dispatch_number) + "'") var = m.group(1) val = m.group(2) var_table[dispatch_number][m.group(1)] = m.group(2) diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index ac9b7a28..0db9e290 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -413,12 +413,13 @@ bool dump_context_entry(context_entry_t* entry) { FILE* 
file_handle = entry->file_handle; const std::string nik_name = (to_truncate_names == 0) ? entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); - fprintf(file_handle, "dispatch[%u], queue_index(%lu), kernel_name(\"%s\")", + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), kernel-name(\"%s\")", index, + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + entry->data.queue_id, entry->data.queue_index, nik_name.c_str()); - if (record) fprintf(file_handle, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)", - HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", record->dispatch, record->begin, record->end, From 569e980a06ba858bfb9ab59db21717135a5d06f3 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 19:45:46 -0600 Subject: [PATCH 023/153] adding kernel properties --- CMakeLists.txt | 20 +- bin/dform.py | 36 +++ bin/rpl_run.sh | 81 ++++++- bin/run_tool.sh | 36 +++ bin/sqlitedb.py | 228 ++++++++++++++++++ bin/tblextr.py | 350 +++++++++++++++++++++++++-- bin/txt2xml.sh | 9 +- inc/rocprofiler.h | 8 +- script/rpl_run.sh | 377 ------------------------------ script/txt2xml.sh | 94 -------- src/core/intercept_queue.h | 16 +- src/core/rocprofiler.cpp | 21 +- src/core/tracker.h | 103 +++++--- src/util/hsa_rsrc_factory.h | 2 +- test/app/intercept_test_stand.cpp | 189 +++++++++++++++ test/run.sh | 2 + test/tool/gfx_metrics.xml | 4 +- test/tool/metrics.xml | 4 +- test/tool/tool.cpp | 172 +++++++------- 19 files changed, 1092 insertions(+), 660 deletions(-) create mode 100644 bin/dform.py create mode 100755 bin/run_tool.sh create mode 100644 bin/sqlitedb.py delete mode 100755 script/rpl_run.sh delete mode 100755 script/txt2xml.sh create mode 100644 test/app/intercept_test_stand.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 05ee1800..18bbee13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,11 +73,14 @@ endif () ## Build tests 
add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) -## Install information -#if ( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) -#message ( "CMAKE default prefix: ${CMAKE_INSTALL_PREFIX}" ) -#endif () -set ( CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} ) +## Create symlinks for packaging and install +add_custom_target ( rocprof-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/bin/rpl_run.sh rocprof-link ) +add_custom_target ( inc-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/include inc-link ) +add_custom_target ( so-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/lib/${ROCPROFILER_LIBRARY}.so so-link ) + set ( CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/${ROCPROFILER_NAME}" ) message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION lib ) @@ -87,8 +90,15 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/dform.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/sqlitedb.py DESTINATION bin PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) +install ( FILES ${PROJECT_BINARY_DIR}/inc-link DESTINATION ../include RENAME ${ROCPROFILER_NAME} ) +install ( FILES ${PROJECT_BINARY_DIR}/so-link DESTINATION ../lib RENAME ${ROCPROFILER_LIBRARY}.so ) +install ( FILES ${PROJECT_BINARY_DIR}/rocprof-link DESTINATION ../bin + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE + RENAME rocprof ) # gfx_metrics.xml metrics.xml install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml diff --git a/bin/dform.py b/bin/dform.py new file mode 100644 index 00000000..5fc8d6fc --- /dev/null +++ b/bin/dform.py @@ 
-0,0 +1,36 @@ +#!/usr/bin/python +from sqlitedb import SQLiteDB + +def post_process_data(db, table_name, outfile = ''): +# db.add_data_column('A', 'DispDurNs', 'INTEGER', 'BeginNs - DispatchNs') +# db.add_data_column('A', 'ComplDurNs', 'INTEGER', 'CompleteNs - EndNs') +# db.add_data_column('A', 'TotalDurNs', 'INTEGER', 'CompleteNs - DispatchNs') +# db.add_data_column(table_name, 'TimeNs', 'INTEGER', 'BeginNs - %d' % start_ns) + db.add_data_column(table_name, 'DurationNs', 'INTEGER', 'EndNs - BeginNs') + if outfile != '': db.dump_csv(table_name, outfile) + +def gen_data_bins(db, outfile): + db.execute('create view C as select Name, Calls, TotalDurationNs, TotalDurationNs/Calls as AverageNs, TotalDurationNs*100.0/(select sum(TotalDurationNs) from %s) as Percentage from %s order by TotalDurationNs desc;' % ('B', 'B')); + db.dump_csv('C', outfile) + db.execute('DROP VIEW C') + +def gen_table_bins(db, table, outfile, name_var, dur_ns_var): + db.execute('create view B as select (%s) as Name, count(%s) as Calls, sum(%s) as TotalDurationNs from %s group by %s' % (name_var, name_var, dur_ns_var, table, name_var)) + gen_data_bins(db, outfile) + db.execute('DROP VIEW B') + +def gen_api_json_trace(db, table, start_us, outfile): + db.execute('create view B as select "Index", Name as name, pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (start_us, table)); + db.dump_json('B', table, outfile) + db.execute('DROP VIEW B') + +def gen_ops_json_trace(db, table, base_pid, start_us, outfile): + db.execute('create view B as select "Index", Name as name, ("gpu-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.dump_json('B', table, outfile) + db.execute('DROP VIEW B') + +def gen_kernel_json_trace(db, table, base_pid, start_us, outfile): + db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, (BeginNs/1000 - %d) as 
ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.dump_json('B', table, outfile) + db.execute('DROP VIEW B') +############################################################################################## diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 78a03fa1..adefad73 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -23,19 +23,27 @@ ################################################################################ time_stamp=`date +%y%m%d_%H%M%S` -BIN_DIR=`dirname $0` -BIN_DIR=`cd $BIN_DIR; pwd` +BIN_DIR=$(dirname $(realpath $0)) +PKG_DIR=$(dirname $BIN_DIR) +ROOT_DIR=$(dirname $PKG_DIR) RUN_DIR=`pwd` TMP_DIR="/tmp" DATA_DIR="rpl_data_${time_stamp}_$$" -PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` -BIN_DIR=$PKG_DIR/bin - # PATH to custom HSA and OpenCl runtimes HSA_PATH=$PKG_DIR/lib/hsa -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH +# roctracer path +if [ -z "$ROCTRACER_PATH" ] ; then ROCTRACER_PATH=$ROOT_DIR/roctracer; fi + +# runtime API trace +HSA_TRACE=0 +HIP_TRACE=0 + +# Generate stats +GEN_STATS=0 + +export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$ROCTRACER_PATH/lib:$ROCTRACER_PATH/tool:$HSA_PATH export PATH=.:$PATH # enable error logging @@ -128,9 +136,14 @@ usage() { echo "" echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" + echo " --ctx-wait - to wait for outstanding contexts on profiler exit [on]" echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" echo " --heartbeat - to print progress heartbeats [0 - disabled]" echo "" + echo " --stats - generating kernel executino stats" + echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing" + echo " --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing" + echo "" echo 
"Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." @@ -178,12 +191,36 @@ run() { mkdir -p "$ROCP_OUTPUT_DIR" fi + API_TRACE="" + PRELOAD_LIBS="" + if [ "$HSA_TRACE" = 1 ] ; then + API_TRACE="hsa" + fi + if [ "$HIP_TRACE" = 1 ] ; then + if [ -z "$API_TRACE" ] ; then + API_TRACE="hip"; + else + API_TRACE="all" + fi + if [ -z "$HCC_HOME" ] ; then error "env var HCC_HOME is not defined"; fi + PRELOAD_LIBS="$PRELOAD_LIBS $HCC_HOME/lib/libmcwamp_hsa.so" + fi + if [ -n "$API_TRACE" ] ; then + API_TRACE=$(echo $API_TRACE | sed 's/all//') + if [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE; fi + export HSA_TOOLS_LIB="libtracer_tool.so libroctracer64.so $HSA_TOOLS_LIB" + PRELOAD_LIBS="$PRELOAD_LIBS $HSA_TOOLS_LIB" + fi + + redirection_cmd="" if [ -n "$ROCP_OUTPUT_DIR" ] ; then OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" - eval "$APP_CMD 2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" - else - eval "$APP_CMD" + redirection_cmd="2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" fi + + #unset ROCP_OUTPUT_DIR + CMD_LINE="LD_PRELOAD='$PRELOAD_LIBS' $APP_CMD $redirection_cmd" + eval "$CMD_LINE" } # main @@ -236,10 +273,29 @@ while [ 1 ] ; do else export ROCP_TIMESTAMP_ON=0 fi + elif [ "$1" = "--ctx-wait" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_OUTSTANDING_WAIT=1 + else + export ROCP_OUTSTANDING_WAIT=0 + fi elif [ "$1" = "--ctx-limit" ] ; then export ROCP_OUTSTANDING_MAX="$2" elif [ "$1" = "--heartbeat" ] ; then export ROCP_OUTSTANDING_MON="$2" + elif [ "$1" = "--stats" ] ; then + ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 + GEN_STATS=1 + elif [ "$1" = "--hsa-trace" ] ; then + ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 + GEN_STATS=1 + HSA_TRACE=1 + elif [ "$1" = "--hip-trace" ] ; then + ARG_VAL=0 + GEN_STATS=1 + HIP_TRACE=1 elif [ "$1" = 
"--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 @@ -323,7 +379,12 @@ for name in $input_list; do done if [ -n "$csv_output" ] ; then - python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST + if [ "$GEN_STATS" = "1" ] ; then + db_output=$(echo $csv_output | sed "s/\.csv/.db/") + python $BIN_DIR/tblextr.py $db_output $OUTPUT_LIST + else + python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST + fi if [ "$?" -eq 0 ] ; then echo "RPL: '$csv_output' is generated" else diff --git a/bin/run_tool.sh b/bin/run_tool.sh new file mode 100755 index 00000000..5af6d1a1 --- /dev/null +++ b/bin/run_tool.sh @@ -0,0 +1,36 @@ +#!/bin/sh +BIN_DIR=`dirname $0` +BIN_DIR=`cd $BIN_DIR; pwd` +PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` +BIN_DIR=$PKG_DIR/bin + +# PATH to custom HSA libs +HSA_PATH=$PKG_DIR/lib/hsa + +if [ -z "$1" ] ; then + echo "Usage: $0 " +else +# profiler plugin library +test_app=$* + +# paths to ROC profiler and oher libraries +export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH +export PATH=.:$PATH + +# ROC profiler library loaded by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so.1 +# tool library loaded by ROC profiler +if [ -z $ROCP_TOOL_LIB ] ; then + export ROCP_TOOL_LIB=libintercept_test.so +fi +# enable error messages +export HSA_TOOLS_REPORT_LOAD_FAILURE=1 +export HSA_VEN_AMD_AQLPROFILE_LOG=1 +export ROCPROFILER_LOG=1 +# to prevent internal simple proxy queue +unset ROCP_PROXY_QUEUE +# ROC profiler metrics config file +export ROCP_METRICS=$BIN_DIR/lib/metrics.xml + +LD_PRELOAD=$ROCP_TOOL_LIB $test_app +fi diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py new file mode 100644 index 00000000..295fe7a7 --- /dev/null +++ b/bin/sqlitedb.py @@ -0,0 +1,228 @@ +import csv, sqlite3, re, sys +from functools import reduce + +# SQLite Database class +class SQLiteDB: + def __init__(self, file_name): + self.connection = sqlite3.connect(file_name) + self.tables = {} + self.json_arg_list_enabled = 0 + + def __del__(self): + self.connection.close() + + # add DB 
table + def add_table(self, name, descr, extra = ()): + (field_list, field_dict) = descr + if name in self.tables: raise Exception('table is already added: "' + name + '"') + + # create DB table + table_descr = [] + for field in field_list: table_descr.append('"%s" %s' % (field, field_dict[field])) + for item in extra: table_descr.append('"%s" %s' % (item[0], item[1])) + stm = 'CREATE TABLE ' + name + ' (%s)' % ', '.join(table_descr) + cursor = self.connection.cursor() + cursor.execute(stm) + self.connection.commit() + + # register table + fields_str = ','.join(map(lambda x: '"' + x + '"', field_list)) + templ_str = ','.join('?' * len(field_list)) + stm = 'INSERT INTO ' + name + '(' + fields_str + ') VALUES(' + templ_str + ');' + self.tables[name] = stm + + return (cursor, stm); + + # add columns to table + def add_columns(self, name, columns): + cursor = self.connection.cursor() + for item in columns: + stm = 'ALTER TABLE ' + name + ' ADD COLUMN "%s" %s' % (item[0], item[1]) + cursor.execute(stm) + self.connection.commit() + + # add columns with expression + def add_data_column(self, table_name, data_label, data_type, data_expr): + cursor = self.connection.cursor() + cursor.execute('ALTER TABLE %s ADD COLUMN "%s" %s' % (table_name, data_label, data_type)) + cursor.execute('UPDATE %s SET %s = (%s);' % (table_name, data_label, data_expr)) + + # populate DB table entry + def insert_entry(self, table, val_list): + (cursor, stm) = table + cursor.execute(stm, val_list) + + # populate DB table entry + def commit_entry(self, table, val_list): + self.insert_entry(table, val_list) + self.connection.commit() + + # populate DB table data + def insert_table(self, table, reader): + for val_list in reader: + if not val_list[-1]: val_list.pop() + self.insert_entry(table, val_list) + self.connection.commit() + + # return table fields list + def _get_fields(self, table_name): + cursor = self.connection.execute('SELECT * FROM ' + table_name) + return list(map(lambda x: '"%s"' % 
(x[0]), cursor.description)) + + # return table raws list + def _get_raws(self, table_name): + cursor = self.connection.execute('SELECT * FROM ' + table_name) + return cursor.fetchall() + def _get_raws_indexed(self, table_name): + cursor = self.connection.execute('SELECT * FROM ' + table_name + ' order by "Index" asc;') + return cursor.fetchall() + def _get_raw_by_id(self, table_name, req_id): + cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE "Index"=?', (req_id,)) + raws = cursor.fetchall() + if len(raws) != 1: + raise Exception('Index is not unique, table "' + table_name + '"') + return list(raws[0]) + + # dump CSV table + def dump_csv(self, table_name, file_name): + if not re.search(r'\.csv$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + + fields = self._get_fields(table_name) + with open(file_name, mode='w') as fd: + fd.write(','.join(fields) + '\n') + for raw in self._get_raws(table_name): + fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') + + # dump JSON trace + def open_json(self, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + with open(file_name, mode='w') as fd: + fd.write('{ "traceEvents":[{}\n'); + + def close_json(self, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + with open(file_name, mode='a') as fd: + fd.write(']}\n'); + + def label_json(self, pid, label, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + with open(file_name, mode='a') as fd: + fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name"}\n' %(label, pid)); + + def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name 
+ '"' ) + with open(file_name, mode='a') as fd: + dep_id = base_id + for ind in range(len(from_tid)): + if (len(corr_id_list) != 0): corr_id = corr_id_list[ind] + else: corr_id = ind + from_ts = from_us_list[ind] - start_us + to_ts = to_us_dict[corr_id] - start_us + if from_ts > to_ts: from_ts = to_ts + fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%s,"tid":%s,"name":"dep"}\n' % (from_ts, dep_id, str(from_pid), from_tid[ind])) + fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%s,"tid":0,"name":"dep"}\n' % (to_ts, dep_id, str(to_pid))) + dep_id += 1 + + def dump_json(self, table_name, data_name, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + + sub_ptrn = re.compile(r'(^"|"$)') + name_ptrn = re.compile(r'(name|Name)') + + table_fields = self._get_fields(table_name) + table_raws = self._get_raws_indexed(table_name) + data_fields = self._get_fields(data_name) + data_raws = self._get_raws_indexed(data_name) + + with open(file_name, mode='a') as fd: + table_raws_len = len(table_raws) + for raw_index in range(table_raws_len): + if (raw_index == table_raws_len - 1) or (raw_index % 1000 == 0): + sys.stdout.write( \ + "\rdump json " + str(raw_index) + ":" + str(len(table_raws)) + " "*100 \ + ) + + vals_list = [] + values = list(table_raws[raw_index]) + for value_index in range(len(values)): + label = table_fields[value_index] + value = values[value_index] + if name_ptrn.search(label): value = sub_ptrn.sub(r'', value) + if label != '"Index"': vals_list.append('%s:"%s"' % (label, value)) + + args_list = [] + data = list(data_raws[raw_index]) + for value_index in range(len(data)): + label = data_fields[value_index] + value = data[value_index] + if name_ptrn.search(label): value = sub_ptrn.sub(r'', value) + if label != '"Index"': args_list.append('%s:"%s"' % (label, value)) + + fd.write(',{"ph":"%s",%s,\n "args":{\n %s\n }\n}\n' % ('X', ','.join(vals_list), ',\n 
'.join(args_list))) + + sys.stdout.write('\n') + + # execute query on DB + def execute(self, cmd): + cursor = self.connection.cursor() + cursor.execute(cmd) + + # commit DB + def commit(self): + self.connection.commit() + + # close DB + def close(self): + self.connection.close() + + # access DB + def get_raws(self, table_name): + cur = self.connection.cursor() + cur.execute("SELECT * FROM %s" % table_name) + return cur.fetchall() + + # return CSV descriptor + # list of fields and dictionaly for the fields types + def _get_csv_descr(self, table_name, fd): + reader = csv.DictReader(fd) + field_names = reader.fieldnames + if not field_names[-1]: field_names.pop() + field_types = {} + + for entry in reader: + fields_left = [f for f in field_names if f not in field_types.keys()] + # all fields processed + if not fields_left: break + + for field in fields_left: + data = entry[field] + # need data for the field to be processed + if len(data) == 0: continue + + if data.isdigit(): + field_types[field] = "INTEGER" + else: + field_types[field] = "TEXT" + + if len(fields_left) > 0: raise Exception('types not found for fields: ', fields_left) + return (field_names, field_types) + + # add CSV table + def add_csv_table(self, table_name, file_name, extra = ()): + with open(file_name, mode='r') as fd: + # get CSV table descriptor + descr = self._get_csv_descr(table_name, fd) + # reader to populate the table + fd.seek(0) + reader = csv.reader(fd) + reader.next() + table = self.add_table(table_name, descr, extra) + self.insert_table(table, reader) + +############################################################################################## diff --git a/bin/tblextr.py b/bin/tblextr.py index 87ecadd5..4c4cc782 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -23,6 +23,8 @@ ################################################################################ import os, sys, re +from sqlitedb import SQLiteDB +import dform # Parsing results in the format: #dispatch[0], queue_index(0), 
kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): @@ -30,8 +32,24 @@ # SQ_WAVES (4096) # SQ_INSTS_VMEM_RD (36864) +COPY_PID = 0 +OPS_PID = 1 +HSA_PID = 2 +HIP_PID = 3 +GPU_BASE_PID = 4 +max_gpu_id = -1 +START_US = 0 + +# dependencies dictionary +dep_dict = {} +kern_dep_list = [] + # global vars -var_list = ['Index', 'KernelName', 'DispatchNs', 'BeginNs', 'EndNs', 'CompleteNs'] +table_descr = [ + ['Index', 'KernelName'], + {'Index': 'INTEGER', 'KernelName': 'TEXT'} +] +var_list = table_descr[0] var_table = {} ############################################################# @@ -42,10 +60,12 @@ def fatal(msg): # parse results method def parse_res(infile): + global max_gpu_id if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found") inp = open(infile, 'r') - beg_pattern = re.compile("^dispatch\[(\d*)\],.* kernel-name\(\"([^\"]*)\"\)") + beg_pattern = re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") + prop_pattern = re.compile("([\w-]+)\((\w+)\)"); ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") var_pattern = re.compile("^\s*([^\s]*)\s+\((\d*)\)") @@ -67,8 +87,25 @@ def parse_res(infile): if not dispatch_number in var_table: var_table[dispatch_number] = { 'Index': dispatch_number, - 'KernelName': "\"" + m.group(2) + "\"" + 'KernelName': "\"" + m.group(3) + "\"" } + + gpu_id = 0 + disp_tid = 0 + + kernel_properties = m.group(2) + for prop in kernel_properties.split(', '): + m = prop_pattern.match(prop) + if m: + var = m.group(1) + val = m.group(2) + var_table[dispatch_number][var] = val + if not var in var_list: var_list.append(var); + if var == 'gpu-id': + gpu_id = int(val) + if (gpu_id > max_gpu_id): max_gpu_id = gpu_id + if var == 'tid': disp_tid = val + else: fatal('wrong kernel property "' + prop + '" in "'+ kernel_properties + '"') m = ts_pattern.search(record) if m: var_table[dispatch_number]['DispatchNs'] = m.group(1) @@ -76,47 +113,310 @@ def 
parse_res(infile): var_table[dispatch_number]['EndNs'] = m.group(3) var_table[dispatch_number]['CompleteNs'] = m.group(4) + gpu_pid = GPU_BASE_PID + int(gpu_id) + if not gpu_pid in dep_dict: dep_dict[gpu_pid] = {} + dep_str = dep_dict[gpu_pid] + if not 'tid' in dep_str: dep_str['tid'] = [] + if not 'from' in dep_str: dep_str['from'] = [] + if not 'to' in dep_str: dep_str['to'] = {} + to_id = len(dep_str['tid']) + from_us = int(m.group(1)) / 1000 + to_us = int(m.group(2)) / 1000 + dep_str['to'][to_id] = to_us + dep_str['from'].append(from_us) + dep_str['tid'].append(disp_tid) + dep_str['pid'] = HSA_PID + kern_dep_list.append((disp_tid, m.group(1))) + inp.close() ############################################################# -# print results table method -def print_tbl(outfile): +# merge results table +def merge_table(): global var_list - if len(var_table) == 0: return 1 + keys = sorted(var_table.keys(), key=int) - out = open(outfile, 'w') + fields = set(var_table[keys[0]]) + if 'DispatchNs' in fields: + var_list.append('DispatchNs') + var_list.append('BeginNs') + var_list.append('EndNs') + var_list.append('CompleteNs') + var_list = [x for x in var_list if x in fields] +############################################################# - keys = var_table.keys() - keys.sort(key=int) +# dump CSV results +def dump_csv(file_name): + global var_list + keys = sorted(var_table.keys(), key=int) - entry = var_table[keys[0]] - list1 = [] - for var in var_list: - if var in entry: - list1.append(var) - var_list = list1 + with open(file_name, mode='w') as fd: + fd.write(','.join(var_list) + '\n'); + for ind in keys: + entry = var_table[ind] + dispatch_number = entry['Index'] + if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") + val_list = [entry[var] for var in var_list] + fd.write(','.join(val_list) + '\n'); +############################################################# + +# fill kernels DB +def fill_kernel_db(table_name, db): + 
global var_list + keys = sorted(var_table.keys(), key=int) - for var in var_list: out.write(var + ',') - out.write("\n") + for var in set(var_list).difference(set(table_descr[1])): + table_descr[1][var] = 'INTEGER' + table_descr[0] = var_list; + + table_handle = db.add_table(table_name, table_descr) for ind in keys: entry = var_table[ind] dispatch_number = entry['Index'] if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") - for var in var_list: out.write(entry[var] + ',') - out.write("\n") + val_list = [entry[var] for var in var_list] + db.insert_entry(table_handle, val_list) +############################################################# + +# fill HSA DB +hsa_table_descr = [ + ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index'], + {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} +] +def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): + file_name = indir + '/' + api_name + '_api_trace.txt' + ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') + ptrn_ac = re.compile(r'hsa_amd_memory_async_copy') + + if not os.path.isfile(file_name): return 0 + + dep_tid_list = [] + dep_from_us_list = [] + dep_id_list = [] + + global START_US + with open(file_name, mode='r') as fd: + line = fd.readline() + record = line[:-1] + m = ptrn_val.match(record) + if m: START_US = int(m.group(1)) / 1000 + START_US = 0 + + record_id = 0 + table_handle = db.add_table(table_name, hsa_table_descr) + with open(file_name, mode='r') as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + rec_vals = [] + for ind in range(1,7): + rec_vals.append(m.group(ind)) + rec_vals[2] = api_pid + rec_vals.append(record_id) + db.insert_entry(table_handle, rec_vals) + if ptrn_ac.search(rec_vals[4]) or record_id in dep_filtr: + beg_ns = int(rec_vals[0]) + end_ns = int(rec_vals[1]) + 
from_us = (beg_ns / 1000) + ((end_ns - beg_ns) / 1000) + dep_from_us_list.append(from_us) + dep_tid_list.append(int(rec_vals[3])) + dep_id_list.append(record_id) + record_id += 1 + else: fatal("hsa bad record") + + for (tid, from_ns) in dep_list: + db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) + record_id += 1 + + if not dep_pid in dep_dict: dep_dict[dep_pid] = {} + dep_dict[dep_pid]['pid'] = api_pid + dep_dict[dep_pid]['tid'] = dep_tid_list + dep_dict[dep_pid]['from'] = dep_from_us_list + if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + + return 1 +############################################################# + +# fill COPY DB +copy_table_descr = [ + ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index'], + {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} +] +def fill_copy_db(table_name, db, indir): + file_name = indir + '/' + 'async_copy_trace.txt' + ptrn_val = re.compile(r'(\d+):(\d+) (.*)$') + ptrn_id = re.compile(r'^async-copy(\d+)$') + + if not COPY_PID in dep_dict: dep_dict[COPY_PID] = {} + dep_to_us_dict = {} + + table_handle = db.add_table(table_name, copy_table_descr) + with open(file_name, mode='r') as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + rec_vals = [] + for ind in range(1,4): rec_vals.append(m.group(ind)) + rec_vals.append(COPY_PID) + rec_vals.append(0) + m = ptrn_id.match(rec_vals[2]) + if m: dep_to_us_dict[int(m.group(1))] = int(rec_vals[0]) / 1000 + else: fatal("bad async-copy entry") + rec_vals.append(m.group(1)) + db.insert_entry(table_handle, rec_vals) + else: fatal("async-copy bad record") - out.close() - return 0 + dep_dict[COPY_PID]['to'] = dep_to_us_dict ############################################################# +# fill HCC ops DB +ops_table_descr = [ + ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index'], + {'Index':'INTEGER', 'Name':'TEXT', 
'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} +] +def fill_ops_db(table_name, db, indir): + global max_gpu_id + file_name = indir + '/' + 'hcc_ops_trace.txt' + ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) (.*)$') + ptrn_id = re.compile(r'^[^:]+:(\d+)$') + + if not os.path.isfile(file_name): return {} + + filtr = {} + + record_id = 0 + table_handle = db.add_table(table_name, ops_table_descr) + with open(file_name, mode='r') as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + rec_vals = [] + for ind in range(1,6): rec_vals.append(m.group(ind)) + gpu_id = int(rec_vals[2]); + if (gpu_id > max_gpu_id): max_gpu_id = gpu_id + gpu_pid = GPU_BASE_PID + int(gpu_id) + rec_vals.append(gpu_pid) + rec_vals.append(0) + m = ptrn_id.match(rec_vals[4]) + if not m: fatal("bad hcc ops entry '" + record + "'") + corr_id = int(m.group(1)) - 1 + rec_vals.append(corr_id) + db.insert_entry(table_handle, rec_vals) + filtr[corr_id] = 1 + + if not gpu_pid in dep_dict: dep_dict[gpu_pid] = {} + dep_dict[gpu_pid]['to'][corr_id] = int(rec_vals[0]) / 1000 + dep_dict[gpu_pid]['bsp'] = OPS_PID + else: fatal("async-copy bad record") + + return filtr +############################################################# # main if (len(sys.argv) < 3): fatal("Usage: " + sys.argv[0] + " ") outfile = sys.argv[1] infiles = sys.argv[2:] -for f in infiles: - parse_res(f) -ret = print_tbl(outfile) -sys.exit(ret) +indir = re.sub(r'\/[^\/]*$', r'', infiles[0]) + +dbfile = '' +csvfile = '' + +if re.search(r'\.csv$', outfile): + csvfile = outfile +elif re.search(r'\.db$', outfile): + dbfile = outfile + csvfile = re.sub(r'\.db$', '.csv', outfile) +else: + fatal("Bad output file '" + outfile + "'") + +for f in infiles: parse_res(f) +if len(var_table) == 0: sys.exit(1) +merge_table() + +if dbfile == '': + dump_csv(csvfile) +else: + statfile = re.sub(r'\.csv$', '.stats.csv', csvfile) + jsonfile 
= re.sub(r'\.csv$', '.json', csvfile) + + with open(dbfile, mode='w') as fd: fd.truncate() + db = SQLiteDB(dbfile) + + hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) + if hsa_trace_found: + fill_copy_db('COPY', db, indir) + + ops_filtr = fill_ops_db('OPS', db, indir) + hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) + + fill_kernel_db('A', db) + + any_trace_found = hsa_trace_found | hip_trace_found + if any_trace_found: + db.open_json(jsonfile) + + if hsa_trace_found: + db.label_json(HSA_PID, "CPU HSA API", jsonfile) + db.label_json(COPY_PID, "COPY", jsonfile) + + if hip_trace_found: + db.label_json(HIP_PID, "CPU HIP API", jsonfile) + + if any_trace_found and max_gpu_id >= 0: + for ind in range(0, int(max_gpu_id) + 1): + db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) + + dform.post_process_data(db, 'A', csvfile) + dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') + if hsa_trace_found and 'BeginNs' in var_list: + dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) + + if hsa_trace_found: + statfile = re.sub(r'stats', r'hsa_stats', statfile) + dform.post_process_data(db, 'HSA') + dform.gen_table_bins(db, 'HSA', statfile, 'Name', 'DurationNs') + dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) + + dform.post_process_data(db, 'COPY') + dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) + + if hip_trace_found: + statfile = re.sub(r'stats', r'hip_stats', statfile) + dform.post_process_data(db, 'HIP') + dform.gen_table_bins(db, 'HIP', statfile, 'Name', 'DurationNs') + dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) + + dform.post_process_data(db, 'OPS') + dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) + + if any_trace_found: + for (to_pid, dep_str) in dep_dict.items(): + if 'bsp' in dep_str: + bspid = dep_str['bsp'] + base_str = dep_dict[bspid] + for v in ('pid', 'tid', 'from', 
'id'): + dep_str[v] = base_str[v] + base_str['inv'] = 1 + + dep_id = 0 + for (to_pid, dep_str) in dep_dict.items(): + if 'inv' in dep_str: continue + from_pid = dep_str['pid'] + tid_list = dep_str['tid'] + from_us_list = dep_str['from'] + to_us_dict = dep_str['to'] + corr_id_list = [] + if 'id' in dep_str: corr_id_list = dep_str['id'] + db.flow_json(dep_id, from_pid, tid_list, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) + dep_id += len(tid_list) + + if any_trace_found: + db.close_json(jsonfile); + db.close() + +sys.exit(0) ############################################################# diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh index 66da77db..27bbe8c4 100755 --- a/bin/txt2xml.sh +++ b/bin/txt2xml.sh @@ -66,11 +66,11 @@ parse() { else output=$outdir/input${index}.xml header="# $timestamp '$output' generated with '$0 $*'" + echo $header > $output if [ "$feature" == "pmc" ] ; then line=`echo "$line" | sed -e "s/ /,/g"` cat >> $output < EOF @@ -78,9 +78,14 @@ EOF if [ "$feature" == "sqtt" ] ; then cat >> $output < +EOF + fi + + if [ "$feature" == "hsa" ] ; then + cat >> $output < EOF fi fi diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 4448128f..6aeb26af 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -26,8 +26,8 @@ THE SOFTWARE. // // The goal of the implementation is to provide a HW specific low-level // performance analysis interface for profiling of GPU compute applications. -// The profiling includes HW performance counters with complex -// performance metrics and HW traces. +// The profiling includes HW performance counters with derived +// performance metrics. // // The library can be used by a tool library loaded by HSA runtime or by // higher level HW independent performance analysis API like PAPI. @@ -42,10 +42,11 @@ THE SOFTWARE. 
#define INC_ROCPROFILER_H_ #include +#include #include #include -#define ROCPROFILER_VERSION_MAJOR 5 +#define ROCPROFILER_VERSION_MAJOR 6 #define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus @@ -220,6 +221,7 @@ typedef struct { const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet const char* kernel_name; // Kernel name uint64_t kernel_object; // Kernel object pointer + const amd_kernel_code_t* kernel_code; // Kernel code pointer int64_t thread_id; // Thread id const rocprofiler_dispatch_record_t* record; // Dispatch record } rocprofiler_callback_data_t; diff --git a/script/rpl_run.sh b/script/rpl_run.sh deleted file mode 100755 index a8260e77..00000000 --- a/script/rpl_run.sh +++ /dev/null @@ -1,377 +0,0 @@ -################################################################################ -# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
-################################################################################ - -#!/bin/sh -time_stamp=`date +%y%m%d_%H%M%S` -BIN_DIR=`dirname $0` -BIN_DIR=`cd $BIN_DIR; pwd` -RUN_DIR=`pwd` -TMP_DIR="/tmp" -DATA_PATH=$TMP_DIR -DATA_DIR="rpl_data_${time_stamp}_$$" - -PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*$//"` -BIN_DIR=$PKG_DIR/bin - -# PATH to custom HSA and OpenCl runtimes -HSA_PATH=$PKG_DIR/lib/hsa - -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH -export PATH=.:$PATH - -# enable error logging -export HSA_TOOLS_REPORT_LOAD_FAILURE=1 -export HSA_VEN_AMD_AQLPROFILE_LOG=1 -export ROCPROFILER_LOG=1 - -# ROC Profiler environment -# Loading of ROC Profiler by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so -# Loading of the test tool by ROC Profiler -export ROCP_TOOL_LIB=libtool.so -# Enabling HSA dispatches intercepting by ROC PRofiler -export ROCP_HSA_INTERCEPT=1 -# Disabling internal ROC Profiler proxy queue (simple version supported for testing purposes) -unset ROCP_PROXY_QUEUE -# ROC Profiler metrics definition -export ROCP_METRICS=$PKG_DIR/lib/metrics.xml -# ROC Profiler package path -export ROCP_PACKAGE_DIR=$PKG_DIR - -# error handling -fatal() { - echo "$0: Error: $1" - echo "" - usage -} - -error() { - echo "$0: Error: $1" - echo "" - exit 1 -} - -# usage method -usage() { - bin_name=`basename $0` - echo "ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package." 
- echo "Full path: $BIN_DIR/$bin_name" - echo "Metrics definition: $PKG_DIR/lib/metrics.xml" - echo "" - echo "Usage:" - echo " rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] " - echo "" - echo "Options:" - echo " -h - this help" - echo " --verbose - verbose mode, dumping all base counters used in the input metrics" - echo " --list-basic - to print the list of basic HW counters" - echo " --list-derived - to print the list of derived metrics with formulas" - echo "" - echo " -i <.txt|.xml file> - input file" - echo " Input file .txt format, automatically rerun application for every pmc/sqtt line:" - echo "" - echo " # Perf counters group 1" - echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize" - echo " # Perf counters group 2" - echo " pmc : WriteSize L2CacheHit" - echo " # SQ tread trace" - echo " sqtt : MASK = 0x0F00 TOKEN_MASK = 0x144B TOKEN_MASK2 = 0xFFFF" - echo " # Filter by dispatches range, GPU index and kernel names" - echo " # supported range formats: \"3:9\", \"3:\", \"3\"" - echo " range: 1 : 4" - echo " gpu: 0 1 2 3" - echo " kernel: simple Pass1 simpleConvolutionPass2" - echo "" - echo " Input file .xml format, for single profiling run:" - echo "" - echo " # Metrics list definition, also the form \":\" can be used" - echo " # All defined metrics can be found in the 'metrics.xml'" - echo " # There are basic metrics for raw HW counters and high-level metrics for derived counters" - echo " " - echo "" - echo " # Trace enabling and the parameters definition" - echo " " - echo " " - echo " " - echo "" - echo " # Filter by dispatches range, GPU index and kernel names" - echo " " - echo "" - echo " Supported by profiler SQTT parameters:" - echo " TARGET_CU - target Compute Unit, MASK.CU_SEL field" - echo " VM_ID_MASK - select which VM IDs to capture, MASK.VM_ID_MASK field" - echo " MASK - MASK register value" - echo " TOKEN_MASK - TOKEN_MASK register value" - echo " 
TOKEN_MASK2 - TOKEN_MASK2 register value, traced instructions mask" - echo " The parameters defaults:" - echo " TARGET_CU = 0;" - echo " VM_ID_MASK = 0;" - echo " MASK:" - echo " mask.bits.CU_SEL = param{TARGET_CU};" - echo " mask.bits.SH_SEL = 0x0;" - echo " mask.bits.SIMD_EN = 0xF;" - echo " mask.bits.SQ_STALL_EN = 0x1;" - echo " mask.bits.SPI_STALL_EN = 0x1;" - echo " mask.bits.REG_STALL_EN = 0x1;" - echo " mask.bits.VM_ID_MASK = param{VM_ID_MASK};" - echo " TOKEN_MASK:" - echo " token_mask.bits.TOKEN_MASK = 0xFFFF;" - echo " token_mask.bits.REG_MASK = 0xFF;" - echo " token_mask.bits.REG_DROP_ON_STALL = 0x1;" - echo " TOKEN_MASK2:" - echo " token_mask2.bits.INST_MASK = 0xFFFFFF7F; // INST_PC is disabled because its tracing can cause extra stalling" - echo " // and it is recommended to disable by SQTT user guide" - echo " HIWATER = 6; // which is 6/8 fraction of the tread trace fifo" - echo "" - echo " -o - output CSV file [.csv]" - echo " -d - directory where profiler store profiling data including thread treaces [/tmp]" - echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." - echo " -t - to change the temporary directory [/tmp]" - echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." - echo "" - echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" - echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" - echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" - echo " --heartbeat - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [0 - disabled]" - echo " --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000]" - echo " Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively." 
- echo " --sqtt-local - to allocate SQTT buffer in local GPU memory [on]" - echo "" - echo "Configuration file:" - echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" - echo" First the configuration file is looking in the current directory, then in your home, and then in the package directory." - echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'." - echo " An example of 'rpl_rc.xml':" - echo " " - echo "" - exit 1 -} - -# profiling run method -OUTPUT_LIST="" -run() { - export ROCP_INPUT="$1" - OUTPUT_DIR="$2" - shift - shift - APP_CMD=$* - - if [ "$OUTPUT_DIR" = "-" ] ; then - input_tag=`echo $ROCP_INPUT | sed "s/\.xml//"` - export ROCP_OUTPUT_DIR=${input_tag}_results_${time_stamp} - elif [ "$OUTPUT_DIR" = "--" ] ; then - unset ROCP_OUTPUT_DIR - else - export ROCP_OUTPUT_DIR=$OUTPUT_DIR - fi - echo "RPL: result dir '$ROCP_OUTPUT_DIR'" - - if [ ! -e "$ROCP_INPUT" ] ; then - error "Input file '$ROCP_INPUT' not found" - fi - - if [ -n "$ROCP_OUTPUT_DIR" ] ; then - if [ "$OUTPUT_DIR" = "-" ] ; then - if [ -e "$ROCP_OUTPUT_DIR" ] ; then - error "generated dir '$ROCP_OUTPUT_DIR' exists" - fi - fi - mkdir -p "$ROCP_OUTPUT_DIR" - fi - - if [ -n "$ROCP_OUTPUT_DIR" ] ; then - OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" - eval "$APP_CMD 2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" - else - eval "$APP_CMD" - fi -} - -# main -echo "RPL: on '$time_stamp' from '$PKG_DIR' at '$RUN_DIR'" -# Parsing arguments -if [ -z "$1" ] ; then - usage -fi - -INPUT_FILE="" -OUTPUT_DIR="-" -output="" -csv_output="" - -ARG_IN="" -while [ 1 ] ; do - ARG_IN=$1 - ARG_VAL=1 - if [ "$1" = "-h" ] ; then - usage - elif [ "$1" = "-i" ] ; then - INPUT_FILE="$2" - elif [ "$1" = "-o" ] ; then - output="$2" - elif [ "$1" = "-d" ] ; then - OUTPUT_DIR="$2" - DATA_PATH=$OUTPUT_DIR - elif [ "$1" = "-t" ] ; then - TMP_DIR="$2" - if [ "$OUTPUT_DIR" = "-" ] ; then - 
DATA_PATH=$TMP_DIR - fi - elif [ "$1" = "--list-basic" ] ; then - export ROCP_INFO=b - eval "$PKG_DIR/test/SimpleConvolution" - exit 1 - elif [ "$1" = "--list-derived" ] ; then - export ROCP_INFO=d - eval "$PKG_DIR/test/SimpleConvolution" - exit 1 - elif [ "$1" = "--basenames" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_TRUNCATE_NAMES=1 - else - export ROCP_TRUNCATE_NAMES=0 - fi - elif [ "$1" = "--timestamp" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_TRACKER_ON=1 - else - export ROCP_TRACKER_ON=0 - fi - elif [ "$1" = "--ctx-limit" ] ; then - export ROCP_OUTSTANDING_MAX="$2" - elif [ "$1" = "--heartbeat" ] ; then - export ROCP_OUTSTANDING_MON="$2" - elif [ "$1" = "--sqtt-size" ] ; then - size_m=`echo "$2" | sed -n "s/^\(.*\)M$/\1/p"` - size_k=`echo "$2" | sed -n "s/^\(.*\)K$/\1/p"` - if [ -n "$size_m" ] ; then size_b=$((size_m*1024*1024)) - elif [ -n "$size_k" ] ; then size_b=$((size_k*1024)) - else size_b=$2 - fi - export ROCP_SQTT_SIZE=$size_b - elif [ "$1" = "--sqtt-local" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_SQTT_LOCAL=1 - else - export ROCP_SQTT_LOCAL=0 - fi - elif [ "$1" = "--verbose" ] ; then - ARG_VAL=0 - export ROCP_VERBOSE_MODE=1 - else - break - fi - shift - if [ "$ARG_VAL" = 1 ] ; then shift; fi -done - -ARG_CK=`echo $ARG_IN | sed "s/^-.*$/-/"` -if [ "$ARG_CK" = "-" ] ; then - fatal "Wrong option '$ARG_IN'" -fi - -if [ -z "$INPUT_FILE" ] ; then - fatal "Need input file" -fi - -input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"` -input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"` -if [ -z "${input_base}" -o -z "${input_type}" ] ; then - fatal "Bad input file '$INPUT_FILE'" -fi -input_base=`basename $input_base` - -if [ "$OUTPUT_DIR" = "--" ] ; then - fatal "Bad output dir '$OUTPUT_DIR'" -fi - -if [ -n "$output" ] ; then - if [ "$output" = "--" ] ; then - OUTPUT_DIR="--" - else - csv_output=$output - fi -else - csv_output=$RUN_DIR/${input_base}.csv -fi - -APP_CMD=$* - -echo "RPL: profiling 
'$APP_CMD'" -echo "RPL: input file '$INPUT_FILE'" - -input_list="" -RES_DIR="" -if [ "$input_type" = "xml" ] ; then - input_list=$INPUT_FILE -elif [ "$input_type" = "txt" ] ; then - OUTPUT_DIR="-" - RES_DIR=$DATA_PATH/$DATA_DIR - if [ -e $RES_DIR ] ; then - error "Rundir '$RES_DIR' exists" - fi - mkdir -p $RES_DIR - echo "RPL: output dir '$RES_DIR'" - $BIN_DIR/txt2xml.sh $INPUT_FILE $RES_DIR - input_list=`/bin/ls $RES_DIR/input*.xml` -else - fatal "Bad input file type '$INPUT_FILE'" -fi - -for name in $input_list; do - run $name $OUTPUT_DIR $APP_CMD -done - -if [ -n "$csv_output" ] ; then - python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST - if [ "$?" = 1 ] ; then - error "CSV generation error, profiling results '$RES_DIR'" - fi - echo "RPL: '$csv_output' is generated" -fi - -if [ "$DATA_PATH" = "$TMP_DIR" ] ; then - if [ -e "$RES_DIR" ] ; then - rm -rf $RES_DIR - fi -fi - -exit 0 diff --git a/script/txt2xml.sh b/script/txt2xml.sh deleted file mode 100755 index 57cb4be7..00000000 --- a/script/txt2xml.sh +++ /dev/null @@ -1,94 +0,0 @@ -################################################################################ -# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -################################################################################ - -#!/bin/bash -timestamp=`date +%y%m%d_%H%M%S` - -if [ $# = 0 ] ; then - echo "Usage: $0 [output dir]" - exit -1 -fi - -input=$1 -outdir=$2 -if [ -z "$outdir" ] ; then - outdir="." -fi - -range="" -kernel="" -gpu_index="" - -parse() { - scan="$1" - index=0 - while read -r line ; do - line=`echo $line | sed "s/\s*#.*$//"` - if [ -z "$line" ] ; then - continue - fi - - feature=`echo $line | sed -n "s/^\s*\([a-z]*\)\s*:.*$/\1/p"` - line=`echo $line | sed "s/^[^:]*:\s*//"` - line=`echo "$line" | sed -e "s/\s*=\s*/=/g" -e "s/\s*:\s*/:/g" -e "s/,\{1,\}/ /g" -e "s/\s\{1,\}/ /g" -e "s/\s*$//"` - - if [ "$scan" = 0 ] ; then - line=`echo "$line" | sed -e "s/ /,/g"` - if [ "$feature" == "range" ] ; then - range=$line - fi - if [ "$feature" == "kernel" ] ; then - kernel=$line - fi - if [ "$feature" == "gpu" ] ; then - gpu_index=$line - fi - else - output=$outdir/input${index}.xml - header="# $timestamp '$output' generated with '$0 $*'" - - if [ "$feature" == "pmc" ] ; then - line=`echo "$line" | sed -e "s/ /,/g"` - cat >> $output < - -EOF - fi - - if [ "$feature" == "sqtt" ] ; then - cat >> $output < - -EOF - fi - fi - - index=$((index + 1)) - done < $input -} - -parse 0 -parse 1 - -exit 0 diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 1f31b0d9..e41dcd0f 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -71,7 +71,7 @@ class InterceptQueue { if (status != 
HSA_STATUS_SUCCESS) EXC_ABORT(status, "ProxyQueue::Create()"); if (tracker_on || tracker_on_) { - if (tracker_ == NULL) tracker_ = new Tracker; + if (tracker_ == NULL) tracker_ = &Tracker::Instance(); status = hsa_amd_profiling_set_profiler_enabled(*queue, true); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_set_profiler_enabled()"); } @@ -110,7 +110,7 @@ class InterceptQueue { static hsa_status_t QueueDestroy(hsa_queue_t* queue) { std::lock_guard lck(mutex_); - hsa_status_t status = HSA_STATUS_ERROR; + hsa_status_t status = HSA_STATUS_SUCCESS; if (destroy_callback_ != NULL) { status = destroy_callback_(queue, callback_data_); @@ -147,7 +147,8 @@ class InterceptQueue { } // Prepareing dispatch callback data - uint64_t kernel_symbol = GetKernelSymbol(dispatch_packet); + const amd_kernel_code_t* kernel_code = GetKernelCode(dispatch_packet); + const uint64_t kernel_symbol = kernel_code->runtime_loader_kernel_symbol; const char* kernel_name = GetKernelName(kernel_symbol); rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, @@ -157,6 +158,7 @@ class InterceptQueue { dispatch_packet, kernel_name, kernel_symbol, + kernel_code, syscall(__NR_gettid), (tracker_entry) ? 
tracker_entry->record : NULL}; @@ -177,7 +179,7 @@ class InterceptQueue { if (tracker_entry != NULL) { Group* context_group = context->GetGroup(group.index); context_group->IncrRefsCount(); - tracker_->Enable(tracker_entry, Context::Handler, reinterpret_cast(context_group)); + tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); } const pkt_vector_t& start_vector = context->StartPackets(group.index); @@ -195,7 +197,7 @@ class InterceptQueue { if (tracker_entry != NULL) { void* context_handler_arg = NULL; rocprofiler_handler_t context_handler_fun = context->GetHandler(&context_handler_arg); - tracker_->Enable(tracker_entry, context_handler_fun, context_handler_arg); + tracker_->EnableDispatch(tracker_entry, context_handler_fun, context_handler_arg); } } } @@ -239,7 +241,7 @@ class InterceptQueue { return static_cast((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask); } - static uint64_t GetKernelSymbol(const hsa_kernel_dispatch_packet_t* dispatch_packet) { + static const amd_kernel_code_t* GetKernelCode(const hsa_kernel_dispatch_packet_t* dispatch_packet) { const amd_kernel_code_t* kernel_code = NULL; hsa_status_t status = util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( @@ -248,7 +250,7 @@ class InterceptQueue { if (HSA_STATUS_SUCCESS != status) { kernel_code = reinterpret_cast(dispatch_packet->kernel_object); } - return kernel_code->runtime_loader_kernel_symbol; + return kernel_code; } static const char* GetKernelName(const uint64_t kernel_symbol) { diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 6042e59e..c3c4bd0c 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -136,12 +136,15 @@ void * tool_handle = NULL; // Load profiling tool library // Return true if intercepting mode is enabled -bool LoadTool() { - bool intercept_mode = false; +enum { + DISPATCH_INTERCEPT_MODE = 0x1 +}; +uint32_t LoadTool() { + uint32_t intercept_mode = 0; const char* 
tool_lib = getenv("ROCP_TOOL_LIB"); if (tool_lib) { - intercept_mode = true; + intercept_mode = DISPATCH_INTERCEPT_MODE; tool_handle = dlopen(tool_lib, RTLD_NOW); if (tool_handle == NULL) { @@ -164,7 +167,7 @@ bool LoadTool() { } rocprofiler_settings_t settings{}; - settings.intercept_mode = (intercept_mode) ? 1 : 0; + settings.intercept_mode = (intercept_mode != 0) ? 1 : 0; settings.sqtt_size = SqttProfile::GetSize(); settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); @@ -173,11 +176,11 @@ bool LoadTool() { if (handler) handler(); else if (handler_prop) handler_prop(&settings); - intercept_mode = (settings.intercept_mode != 0); SqttProfile::SetSize(settings.sqtt_size); SqttProfile::SetLocal(settings.sqtt_local != 0); util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); + if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; } return intercept_mode; @@ -313,6 +316,9 @@ hsa_status_t CreateQueuePro( rocprofiler_properties_t rocprofiler_properties; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; +Tracker* Tracker::instance_ = NULL; +Tracker::mutex_t Tracker::glob_mutex_; +Tracker::counter_t Tracker::counter_ = 0; util::Logger::mutex_t util::Logger::mutex_; util::Logger* util::Logger::instance_ = NULL; } @@ -355,8 +361,8 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa } // Loading a tool lib and setting of intercept mode - const bool intercept_mode_on = rocprofiler::LoadTool(); - if (intercept_mode_on) intercept_mode = true; + const uint32_t intercept_mode_mask = rocprofiler::LoadTool(); + if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) intercept_mode = true; // HSA intercepting if (intercept_mode) { @@ -371,6 +377,7 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa // HSA-runtime tool 
on-unload method PUBLIC_API void OnUnload() { + rocprofiler::Tracker::Destroy(); rocprofiler::UnloadTool(); rocprofiler::RestoreHsaApi(); } diff --git a/src/core/tracker.h b/src/core/tracker.h index ab7f3b5d..0cada86f 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -47,8 +47,10 @@ class Tracker { struct entry_t; typedef std::list sig_list_t; typedef sig_list_t::iterator sig_list_it_t; + typedef uint64_t counter_t; struct entry_t { + counter_t index; std::atomic valid; Tracker* tracker; sig_list_t::iterator it; @@ -58,22 +60,25 @@ class Tracker { record_t* record; std::atomic handler; void* arg; - bool context_active; + bool is_context; + bool is_memcopy; }; - Tracker() : - outstanding_(0), - hsa_rsrc_(&(util::HsaRsrcFactory::Instance())) - {} + static Tracker* Create() { + std::lock_guard lck(glob_mutex_); + if (instance_ == NULL) instance_ = new Tracker; + return instance_; + } - ~Tracker() { - auto it = sig_list_.begin(); - auto end = sig_list_.end(); - while (it != end) { - auto cur = it++; - hsa_rsrc_->SignalWait((*cur)->signal); - Erase(cur); - } + static Tracker& Instance() { + if (instance_ == NULL) instance_ = Create(); + return *instance_; + } + + static void Destroy() { + std::lock_guard lck(glob_mutex_); + if (instance_ != NULL) delete instance_; + instance_ = NULL; } // Add tracker entry @@ -102,6 +107,7 @@ class Tracker { // Adding antry to the list mutex_.lock(); entry->it = sig_list_.insert(sig_list_.end(), entry); + entry->index = counter_++; mutex_.unlock(); return entry; @@ -130,20 +136,39 @@ class Tracker { } } - void Enable(entry_t* entry, hsa_amd_signal_handler handler, void* arg) { - entry->context_active = true; + void EnableContext(entry_t* entry, hsa_amd_signal_handler handler, void* arg) { + entry->is_context = true; + Enable(entry, reinterpret_cast(handler), arg); + } + void EnableDispatch(entry_t* entry, rocprofiler_handler_t handler, void* arg) { Enable(entry, reinterpret_cast(handler), arg); } - void Enable(entry_t* entry, 
rocprofiler_handler_t handler, void* arg) { + void EnableMemcopy(entry_t* entry, hsa_amd_signal_handler handler, void* arg) { + entry->is_memcopy = true; Enable(entry, reinterpret_cast(handler), arg); } private: + Tracker() : + outstanding_(0), + hsa_rsrc_(&(util::HsaRsrcFactory::Instance())) + {} + + ~Tracker() { + auto it = sig_list_.begin(); + auto end = sig_list_.end(); + while (it != end) { + auto cur = it++; + hsa_rsrc_->SignalWait((*cur)->signal); + Erase(cur); + } + } + // Delete an entry by iterator void Erase(const sig_list_it_t& it) { Delete(*it); } // Entry completion - inline void Complete(entry_t* entry) { + inline void Complete(hsa_signal_value_t signal_value, entry_t* entry) { record_t* record = entry->record; // Debug trace @@ -154,12 +179,20 @@ class Tracker { } // Query begin/end and complete timestamps - hsa_amd_profiling_dispatch_time_t dispatch_time{}; - hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); + if (entry->is_memcopy) { + hsa_amd_profiling_async_copy_time_t async_copy_time{}; + hsa_status_t status = hsa_amd_profiling_get_async_copy_time(entry->signal, &async_copy_time); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_async_copy_time"); + record->begin = hsa_rsrc_->SysclockToNs(async_copy_time.start); + record->end = hsa_rsrc_->SysclockToNs(async_copy_time.end); + } else { + hsa_amd_profiling_dispatch_time_t dispatch_time{}; + hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); + record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); + record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end); + } - record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); - record->end = 
hsa_rsrc_->SysclockToNs(dispatch_time.end); record->complete = hsa_rsrc_->TimestampNs(); entry->valid.store(true, std::memory_order_release); @@ -171,16 +204,17 @@ class Tracker { orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; - const hsa_signal_value_t value = hsa_signal_load_relaxed(orig); - hsa_signal_store_screlease(orig, value - 1); + const hsa_signal_value_t new_value = hsa_signal_load_relaxed(orig) - 1; + if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value"); + hsa_signal_store_screlease(orig, signal_value); } } - inline static void HandleEntry(entry_t* entry) { + inline static void HandleEntry(hsa_signal_value_t signal_value, entry_t* entry) { // Call entry handler void* handler = static_cast(entry->handler); - if (entry->context_active) { - reinterpret_cast(handler)(0, entry->arg); + if (entry->is_context || entry->is_memcopy) { + reinterpret_cast(handler)(signal_value, entry->arg); } else { rocprofiler_group_t group{}; reinterpret_cast(handler)(group, entry->arg); @@ -190,7 +224,7 @@ class Tracker { } // Handler for packet completion - static bool Handler(hsa_signal_value_t, void* arg) { + static bool Handler(hsa_signal_value_t signal_value, void* arg) { // Acquire entry entry_t* entry = reinterpret_cast(arg); volatile std::atomic* ptr = &entry->handler; @@ -198,10 +232,10 @@ class Tracker { // Complete entry Tracker* tracker = entry->tracker; - tracker->Complete(entry); + tracker->Complete(signal_value, entry); if (ordering_enabled_ == false) { - HandleEntry(entry); + HandleEntry(signal_value, entry); } else { // Acquire last entry entry_t* back = tracker->sig_list_.back(); @@ -214,7 +248,7 @@ class Tracker { while (it != end) { entry = *(it++); if (entry->valid.load(std::memory_order_acquire)) { - HandleEntry(entry); + HandleEntry(signal_value, entry); } else { break; } @@ -225,6 +259,11 @@ class Tracker { return false; } + // instance + static Tracker* 
instance_; + static mutex_t glob_mutex_; + static counter_t counter_; + // Tracked signals list sig_list_t sig_list_; // Inter-thread synchronization @@ -235,7 +274,7 @@ class Tracker { // HSA resources factory util::HsaRsrcFactory* hsa_rsrc_; // Handling ordering enabled - static const bool ordering_enabled_ = true; + static const bool ordering_enabled_ = false; // Enable tracing static const bool trace_on_ = false; }; diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index 9997a81c..b3f3cf0d 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -135,7 +135,7 @@ class HsaTimer { sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; } - // Methids for system-clock/ns conversion + // Methods for system-clock/ns conversion timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } diff --git a/test/app/intercept_test_stand.cpp b/test/app/intercept_test_stand.cpp new file mode 100644 index 00000000..7e6298e7 --- /dev/null +++ b/test/app/intercept_test_stand.cpp @@ -0,0 +1,189 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "ctrl/test_hsa.h" +#include "inc/rocprofiler.h" +#include "dummy_kernel/dummy_kernel.h" +#include "simple_convolution/simple_convolution.h" +#include "util/test_assert.h" + +// Dispatch callbacks and context handlers synchronization +pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +// Error handler +void fatal(const std::string msg) { + fflush(stdout); + fprintf(stderr, "%s\n\n", msg.c_str()); + fflush(stderr); + abort(); +} + +// Check returned HSA API status +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// Context stored entry type +struct context_entry_t { + bool valid; + hsa_agent_t agent; + rocprofiler_group_t group; + rocprofiler_callback_data_t data; +}; + +// Dump stored context entry +void dump_context_entry(context_entry_t* entry) { + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); + + const std::string kernel_name = entry->data.kernel_name; + const rocprofiler_dispatch_record_t* record = entry->data.record; + + fflush(stdout); + fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\")", entry->data.kernel_object, 
kernel_name.c_str()); + if (record) fprintf(stdout, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)", + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + record->dispatch, + record->begin, + record->end, + record->complete); + fprintf(stdout, "\n"); + fflush(stdout); + + rocprofiler_group_t& group = entry->group; + if (group.context == NULL) { + fprintf(stderr, "tool error: context is NULL\n"); + abort(); + } + + rocprofiler_close(group.context); +} + +// Profiling completion handler +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(entry); + delete entry; + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* /*user_data*/, + rocprofiler_group_t* group) { + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + // Profiling context + rocprofiler_t* context = NULL; + + // Context entry + context_entry_t* entry = new context_entry_t(); + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = (void*)entry; + + // Open profiling context + status = rocprofiler_open(callback_data->agent, NULL, 0, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + // Fill profiling context entry + entry->agent = callback_data->agent; + entry->group = *group; + entry->data = *callback_data; + entry->data.kernel_name = strdup(callback_data->kernel_name); + reinterpret_cast*>(&entry->valid)->store(true); + + return 
HSA_STATUS_SUCCESS; +} + +int main() { + bool ret_val = false; + const char* kiter_s = getenv("ROCP_KITER"); + const char* diter_s = getenv("ROCP_DITER"); + const unsigned kiter = (kiter_s != NULL) ? atol(kiter_s) : 1; + const unsigned diter = (diter_s != NULL) ? atol(diter_s) : 1; + + // Instantiate HSA resources + HsaRsrcFactory::Create(); + + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) abort(); + + // Creating the queue + hsa_queue_t* queue = NULL; + if (HsaRsrcFactory::Instance().CreateQueue(agent_info, 128, &queue) == false) abort(); + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{}; + callbacks_ptrs.dispatch = dispatch_callback; + rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL); + + // Test initialization + TestHsa::SetQueue(queue); + TestHsa::HsaInstantiate(0); + + for (unsigned ind = 0; ind < kiter; ++ind) { + printf("Iteration %u:\n", ind); + ret_val = RunKernel(0, NULL, diter); + if (ret_val) ret_val = RunKernel(0, NULL, diter); + } + + TestHsa::HsaShutdown(); + + return (ret_val) ? 0 : 1; +} diff --git a/test/run.sh b/test/run.sh index 0a0a2f72..550ad5b1 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,6 +22,8 @@ # THE SOFTWARE. 
################################################################################ +# enable tools load failure reporting +export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # paths to ROC profiler and oher libraries export LD_LIBRARY_PATH=$PWD # ROC profiler library loaded by HSA runtime diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index 9e4f24fc..fecfe7b9 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -29,7 +29,7 @@ - + @@ -65,5 +65,5 @@ - + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index 4011c131..0b53b72e 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -167,7 +167,7 @@ # WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). @@ -177,7 +177,7 @@ expr=100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE > - # The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). + # ALUStalledByLDS The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). 
* range; }; +// kernel properties structure +struct kernel_properties_t { + uint32_t grid_size; + uint32_t workgroup_size; + uint32_t lds_size; + uint32_t scratch_size; + uint32_t vgpr_count; + uint32_t sgpr_count; + uint32_t fbarrier_count; + hsa_signal_t signal; +}; + // Context stored entry type struct context_entry_t { bool valid; @@ -79,6 +91,7 @@ struct context_entry_t { rocprofiler_feature_t* features; unsigned feature_count; rocprofiler_callback_data_t data; + kernel_properties_t kernel_properties; FILE* file_handle; }; @@ -100,7 +113,7 @@ context_array_t* context_array = NULL; // Contexts collected count volatile uint32_t context_count = 0; volatile uint32_t context_collected = 0; -// Profiling results output file name +// Profiling results output dir const char* result_prefix = NULL; // Global results file handle FILE* result_file_handle = NULL; @@ -116,6 +129,7 @@ std::vector* kernel_string_vec = NULL; // DIspatch number range filter std::vector* range_vec = NULL; // Otstanding dispatches parameters +static uint32_t CTX_OUTSTANDING_WAIT = 1; static uint32_t CTX_OUTSTANDING_MAX = 0; static uint32_t CTX_OUTSTANDING_MON = 0; // to truncate kernel names @@ -413,11 +427,20 @@ bool dump_context_entry(context_entry_t* entry) { FILE* file_handle = entry->file_handle; const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), kernel-name(\"%s\")", + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), tid(%lu), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", index, HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, entry->data.queue_id, entry->data.queue_index, + entry->data.thread_id, + entry->kernel_properties.grid_size, + entry->kernel_properties.workgroup_size, + entry->kernel_properties.lds_size, + entry->kernel_properties.scratch_size, + entry->kernel_properties.vgpr_count, + entry->kernel_properties.sgpr_count, + entry->kernel_properties.fbarrier_count, + entry->kernel_properties.signal.handle, nik_name.c_str()); if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", record->dispatch, @@ -480,13 +503,13 @@ void dump_context_array(hsa_queue_t* queue) { } } } + } - if (pthread_mutex_unlock(&mutex) != 0) { - perror("pthread_mutex_unlock"); - abort(); - } - if (done == false) sched_yield(); + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); } + if (done == false) sched_yield(); } } @@ -563,6 +586,8 @@ bool check_filter(const rocprofiler_callback_data_t* callback_data, const callba hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) { // Passed tool data + const hsa_kernel_dispatch_packet_t* packet = callback_data->packet; + const amd_kernel_code_t* kernel_code = callback_data->kernel_code; callbacks_data_t* tool_data = reinterpret_cast(user_data); // HSA status hsa_status_t status = HSA_STATUS_ERROR; @@ -578,6 +603,21 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, rocprofiler_t* context = NULL; // Context entry context_entry_t* entry = alloc_context_entry(); + // kernel properties + 
kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties); + uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z; + if (grid_size > UINT32_MAX) abort(); + kernel_properties_ptr->grid_size = (uint32_t)grid_size; + uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; + if (workgroup_size > UINT32_MAX) abort(); + kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; + kernel_properties_ptr->lds_size = packet->group_segment_size; + kernel_properties_ptr->scratch_size = packet->private_segment_size; + kernel_properties_ptr->vgpr_count = kernel_code->reserved_vgpr_count; + kernel_properties_ptr->sgpr_count = kernel_code->reserved_sgpr_count; + kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; + kernel_properties_ptr->signal = packet->completion_signal; + // context properties rocprofiler_properties_t properties{}; properties.handler = (result_prefix != NULL) ? context_handler : NULL; @@ -660,7 +700,7 @@ static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg return HSA_STATUS_SUCCESS; } -std::string normalize_token(const std::string token, bool not_empty, std::string label) { +std::string normalize_token(const std::string& token, bool not_empty, const std::string& label) { const std::string space_chars_set = " \t"; const size_t first_pos = token.find_first_not_of(space_chars_set); size_t norm_len = 0; @@ -676,23 +716,17 @@ std::string normalize_token(const std::string token, bool not_empty, std::string } if (((first_pos != std::string::npos) && (norm_len == 0)) || ((first_pos == std::string::npos) && not_empty)) { - fatal(label + ": " + error_str); + fatal("normalize_token error, " + label + ": '" + token + "'," + error_str); } return (norm_len != 0) ? 
token.substr(first_pos, norm_len) : std::string(""); } -int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { +int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { int parse_iter = 0; - auto nodes = xml->GetNodes(tag); - auto rit = nodes.rbegin(); - auto rend = nodes.rend(); - while (rit != rend) { - auto& opts = (*rit)->opts; - if (opts.find(field) != opts.end()) break; - ++rit; - } - if (rit != rend) { - const std::string array_string = (*rit)->opts[field]; + const auto& opts = node->opts; + auto it = opts.find(field); + if (it != opts.end()) { + const std::string array_string = it->second; if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str()); size_t pos1 = 0; const size_t string_len = array_string.length(); @@ -701,14 +735,30 @@ int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& fiel const bool found = (pos2 != std::string::npos); const size_t token_len = (pos2 != std::string::npos) ? 
pos2 - pos1 : string_len - pos1; const std::string token = array_string.substr(pos1, token_len); - const std::string norm_str = normalize_token(token, found, "Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field); + const std::string norm_str = normalize_token(token, found, "get_xml_array"); if (norm_str.length() != 0) vec->push_back(norm_str); if (!found) break; pos1 = pos2 + 1; ++parse_iter; } } + return parse_iter; +} +int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { + int parse_iter = 0; + const auto nodes = xml->GetNodes(tag); + auto rit = nodes.rbegin(); + const auto rend = nodes.rend(); + while (rit != rend) { + auto& opts = (*rit)->opts; + if (opts.find(field) != opts.end()) break; + ++rit; + } + if (rit != rend) { + parse_iter = get_xml_array(*rit, field, delim, vec, label); + //fatal("Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field); + } return parse_iter; } @@ -765,6 +815,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) if (it != opts.end()) { to_truncate_names = (it->second == "on") ? 1 : 0; } it = opts.find("timestamp"); if (it != opts.end()) { settings->timestamp_on = (it->second == "on") ? 
1 : 0; } + it = opts.find("ctx-wait"); + if (it != opts.end()) { CTX_OUTSTANDING_WAIT = atol(it->second.c_str()); } it = opts.find("ctx-limit"); if (it != opts.end()) { CTX_OUTSTANDING_MAX = atol(it->second.c_str()); } it = opts.find("heartbeat"); @@ -789,6 +841,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Enable kernel names truncating check_env_var("ROCP_TRUNCATE_NAMES", to_truncate_names); // Set outstanding dispatches parameter + check_env_var("ROCP_OUTSTANDING_WAIT", CTX_OUTSTANDING_WAIT); check_env_var("ROCP_OUTSTANDING_MAX", CTX_OUTSTANDING_MAX); check_env_var("ROCP_OUTSTANDING_MON", CTX_OUTSTANDING_MON); // Enable timestamping @@ -884,10 +937,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) range_vec->push_back(*(range_vec->begin()) + 1); } - // Getting traces - auto traces_list = xml->GetNodes("top.trace"); - - const unsigned feature_count = metrics_vec.size() + traces_list.size(); + const unsigned feature_count = metrics_vec.size(); rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); @@ -901,71 +951,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } if (metrics_vec.size()) printf("\n"); - printf(" %d traces\n", (int)traces_list.size()); - unsigned index = metrics_vec.size(); - for (auto* entry : traces_list) { - auto params_list = xml->GetNodes("top.trace.parameters"); - if (params_list.size() > 1) { - fatal("ROCProfiler: Single input 'parameters' section is supported"); - } - std::string name = ""; - bool to_copy_data = false; - for (const auto& opt : entry->opts) { - if (opt.first == "name") name = opt.second; - else if (opt.first == "copy") to_copy_data = (opt.second == "true"); - else fatal("ROCProfiler: Bad trace property '" + opt.first + "'"); - } - if (name == "") fatal("ROCProfiler: Bad trace properties, name is not specified"); - - std::map 
parameters_dict; - parameters_dict["TARGET_CU"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; - parameters_dict["VM_ID_MASK"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK; - parameters_dict["MASK"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; - parameters_dict["TOKEN_MASK"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; - parameters_dict["TOKEN_MASK2"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; -#ifdef AQLPROF_NEW_API - parameters_dict["SE_MASK"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK; -#endif - - printf(" %s (", name.c_str()); - features[index] = {}; - features[index].kind = ROCPROFILER_FEATURE_KIND_TRACE; - features[index].name = strdup(name.c_str()); - features[index].data.result_bytes.copy = to_copy_data; - - for (auto* params : params_list) { - const unsigned parameter_count = params->opts.size(); - rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; - unsigned p_index = 0; - for (auto& v : params->opts) { - const std::string parameter_name = v.first; - if (parameters_dict.find(parameter_name) == parameters_dict.end()) { - fprintf(stderr, "ROCProfiler: unknown trace parameter '%s'\n", parameter_name.c_str()); - abort(); - } - const uint32_t value = strtol(v.second.c_str(), NULL, 0); - printf("\n %s = 0x%x", parameter_name.c_str(), value); - parameters[p_index] = {}; - parameters[p_index].parameter_name = parameters_dict[parameter_name]; - parameters[p_index].value = value; - ++p_index; - } - - features[index].parameters = parameters; - features[index].parameter_count = parameter_count; - } - if (params_list.empty() == false) printf("\n "); - printf(")\n"); - fflush(stdout); - ++index; - } - fflush(stdout); + const uint32_t features_found = metrics_vec.size(); // Context array aloocation context_array = new context_array_t; @@ -977,7 +963,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) callbacks_data = new callbacks_data_t{}; 
callbacks_data->features = features; - callbacks_data->feature_count = feature_count; + callbacks_data->feature_count = features_found; callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; callbacks_data->group_index = 0; callbacks_data->file_handle = result_file_handle; @@ -1022,13 +1008,13 @@ extern "C" PUBLIC_API void OnUnloadTool() { fflush(stdout); if (result_file_opened) { printf("\nROCPRofiler:"); fflush(stdout); - dump_context_array(NULL); + if (CTX_OUTSTANDING_WAIT == 1) dump_context_array(NULL); fclose(result_file_handle); printf(" %u contexts collected, output directory %s\n", context_collected, result_prefix); } else { if (context_collected != context_count) { results_output_break(); - dump_context_array(NULL); + if (CTX_OUTSTANDING_WAIT == 1) dump_context_array(NULL); } printf("\nROCPRofiler: %u contexts collected\n", context_collected); } From e029fd1ca4d8c0c3dc18430dfdfdce786d49aafd Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 19:52:04 -0600 Subject: [PATCH 024/153] removing scripts, moved as bin --- script/tblextr.py | 118 ---------------------------------------------- 1 file changed, 118 deletions(-) delete mode 100755 script/tblextr.py diff --git a/script/tblextr.py b/script/tblextr.py deleted file mode 100755 index f6a37dc0..00000000 --- a/script/tblextr.py +++ /dev/null @@ -1,118 +0,0 @@ -################################################################################ -# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
-################################################################################ - -#!/usr/bin/python -import os, sys, re - -# Parsing results in the format: -#dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): -# GRBM_GUI_ACTIVE (74332) -# SQ_WAVES (4096) -# SQ_INSTS_VMEM_RD (36864) - -# global vars -var_list = ['Index', 'KernelName', 'DispatchNs', 'BeginNs', 'EndNs', 'CompleteNs'] -var_table = {} -############################################################# - -def fatal(msg): - sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); - sys.exit(1) -############################################################# - -# parse results method -def parse_res(infile): - if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found") - inp = open(infile, 'r') - - beg_pattern = re.compile("^dispatch\[(\d*)\], queue_index\(\d*\), kernel_name\(\"([^\"]*)\"\)") - ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") - var_pattern = re.compile("^\s*([^\s]*)\s+\((\d*)\)") - - dispatch_number = 0 - for line in inp.readlines(): - record = line[:-1] - - m = var_pattern.match(record) - if m: - if not dispatch_number in var_table: fatal("Error: dispatch number not unique '" + str(dispatch_number) + "'") - var = m.group(1) - val = m.group(2) - var_table[dispatch_number][m.group(1)] = m.group(2) - if not var in var_list: var_list.append(var) - - m = beg_pattern.match(record) - if m: - dispatch_number = m.group(1) - if not dispatch_number in var_table: - var_table[dispatch_number] = { - 'Index': dispatch_number, - 'KernelName': "\"" + m.group(2) + "\"" - } - m = ts_pattern.search(record) - if m: - var_table[dispatch_number]['DispatchNs'] = m.group(1) - var_table[dispatch_number]['BeginNs'] = m.group(2) - var_table[dispatch_number]['EndNs'] = m.group(3) - var_table[dispatch_number]['CompleteNs'] = m.group(4) - - inp.close() 
-############################################################# - -# print results table method -def print_tbl(outfile): - global var_list - - out = open(outfile, 'w') - - keys = sorted(var_table.keys(), key=int) - - entry = var_table[keys[0]] - list1 = [] - for var in var_list: - if var in entry: - list1.append(var) - var_list = list1 - - for var in var_list: out.write(var + ',') - out.write("\n") - - for ind in keys: - entry = var_table[ind] - dispatch_number = entry['Index'] - if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") - for var in var_list: out.write(entry[var] + ',') - out.write("\n") - - out.close() -############################################################# - -# main -if (len(sys.argv) < 3): fatal("Usage: " + sys.argv[0] + " ") - -outfile = sys.argv[1] -infiles = sys.argv[2:] -for f in infiles : - parse_res(f) -print_tbl(outfile) -sys.exit(0) -############################################################# From 9e00d58891b0fc3f84eae5bfb455973123388b2d Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 19:58:09 -0600 Subject: [PATCH 025/153] readme: adding 'rocprof' hsa trace option --- README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 6f02bca0..ed0d3709 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Options: --list-derived - to print the list of derived metrics with formulas -i <.txt|.xml file> - input file - Input file .txt format, automatically rerun application for every pmc/sqtt line: + Input file .txt format, automatically rerun application for every pmc line: # Perf counters group 1 pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts FetchSize @@ -107,21 +107,20 @@ Options: --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress 
heartbeats [0 - disabled] - --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000] - Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively. - --sqtt-local - to allocate SQTT buffer in local GPU memory [on] + + --stats - generating kernel executino stats + --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing + --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: First the configuration file is looking in the current directory, then in your home, and then in the package directory. - Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. An example of 'rpl_rc.xml': ``` From 3eb77389dfc22889688e2b661e5859055182b1c8 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 20:19:30 -0600 Subject: [PATCH 026/153] readme: generated files info --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed0d3709..1894c562 100644 --- a/README.md +++ b/README.md @@ -108,9 +108,12 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] - --stats - generating kernel executino stats + --stats - generating kernel executino stats, file .stats.csv --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing + Generated files: .hsa_stats.txt .json --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing + Generated files: .hip_stats.txt .json + Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:/home/evgeny: From f56e53a5c0e08c988fb7608e2b8668fb8ceed1cb Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 20:25:07 -0600 Subject: [PATCH 027/153] tracing info help --- README.md | 3 +-- bin/rocprof | 1 + bin/rpl_run.sh | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) create mode 120000 bin/rocprof diff --git a/README.md b/README.md index 1894c562..210e9c5e 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ The library source tree: ## Profiling utility usage: ``` - rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] + rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] Options: -h - this help @@ -114,7 +114,6 @@ Options: --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing Generated files: .hip_stats.txt .json - Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: First the configuration file is looking in the current directory, then in your home, and then in the package directory. 
diff --git a/bin/rocprof b/bin/rocprof new file mode 120000 index 00000000..e3aaad4e --- /dev/null +++ b/bin/rocprof @@ -0,0 +1 @@ +rpl_run.sh \ No newline at end of file diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index adefad73..91fd6703 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -140,9 +140,11 @@ usage() { echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" echo " --heartbeat - to print progress heartbeats [0 - disabled]" echo "" - echo " --stats - generating kernel executino stats" + echo " --stats - generating kernel execution stats, file .stats.csv" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing" + echo " Generated files: .hsa_stats.txt .json" echo " --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing" + echo " Generated files: .hip_stats.txt .json" echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" From 83bc8e6a4c3b5d2fa207ca2fa4936ad05fe11ec6 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 4 Feb 2019 20:37:45 -0600 Subject: [PATCH 028/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 210e9c5e..60fd9a3a 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ HW specific low-level performance analysis interface for profiling of GPU comput The library source tree: - bin - - rpl_run.sh - Profiling tool run script + - rocprof - Profiling tool run script - doc - Documentation - inc/rocprofiler.h - Library public API - src - Library sources @@ -34,7 +34,7 @@ The library source tree: cd .../rocprofiler mkdir build cd build - export CMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa + export CMAKE_PREFIX_PATH=/opt/rocm cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. 
make make install From 758ddf5d5d9d71cb34e8dcf4a69bdb98be90c477 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Feb 2019 16:05:21 -0600 Subject: [PATCH 029/153] Update README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 60fd9a3a..34ef3f38 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,10 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces -The library source tree: +[Profiler default metrics](test/tool/metrics.xml) + +## Source tree: +``` - bin - rocprof - Profiling tool run script - doc - Documentation @@ -20,6 +23,7 @@ The library source tree: - ctrl - Test controll - util - Test utils - simple_convolution - Simple convolution test kernel +``` ## Build environment: ``` From 67b17cb95f1463de0844e1e9604c46faea2b9f21 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Feb 2019 16:07:02 -0600 Subject: [PATCH 030/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 34ef3f38..ea99af25 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. 
The profiling includes HW performance counters with complex performance metrics and HW traces -[Profiler default metrics](test/tool/metrics.xml) +[Profiler default metrics XML specification](test/tool/metrics.xml) ## Source tree: ``` From 61c17b282486f43586f7753aaf0f6580a1d4a196 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Feb 2019 16:08:10 -0600 Subject: [PATCH 031/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ea99af25..c77ab5c3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces -[Profiler default metrics XML specification](test/tool/metrics.xml) +[The link to profiler default metrics XML specification](test/tool/metrics.xml) ## Source tree: ``` From efdb465389bad87820194f1360c7f880516187ea Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Feb 2019 16:17:13 -0600 Subject: [PATCH 032/153] Update README.md --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c77ab5c3..edfc0e58 100644 --- a/README.md +++ b/README.md @@ -115,8 +115,14 @@ Options: --stats - generating kernel executino stats, file .stats.csv --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing Generated files: .hsa_stats.txt .json - --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing - Generated files: .hip_stats.txt .json + Traced API list can be set by input .txt or .xml files. 
+ Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: From 29fe11b4aacb5da603393ab614b52d4174aa2ff2 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 14:37:01 -0600 Subject: [PATCH 033/153] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index edfc0e58..1b01636e 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,8 @@ Options: --heartbeat - to print progress heartbeats [0 - disabled] --stats - generating kernel executino stats, file .stats.csv - --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing + --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing + --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing Generated files: .hsa_stats.txt .json Traced API list can be set by input .txt or .xml files. Input .txt: From baf874f85acc9dd8349057c09e4ed65ddb763b75 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 17:23:56 -0600 Subject: [PATCH 034/153] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1b01636e..6049fba6 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. -HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces +HW specific low-level performance analysis interface for profiling of GPU compute applications. 
The profiling includes HW performance counters with complex performance metrics and HW traces. +There two usage modes for counters access, system wide sampling and accumulating per kernels. In per kernel usage mode the kernels execution is serialized. [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 4d1a2edabf3cb3bf9473ceadeafa62e782aa98a3 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 17:25:18 -0600 Subject: [PATCH 035/153] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6049fba6..385401b5 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ # ROC-profiler - +``` ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces. -There two usage modes for counters access, system wide sampling and accumulating per kernels. In per kernel usage mode the kernels execution is serialized. +There two usage modes for counters access, system wide sampling and per kernels accumulating. In per kernel usage mode the kernels execution is serialized. [The link to profiler default metrics XML specification](test/tool/metrics.xml) +``` ## Source tree: ``` From 1dd85febd918a96a200eb14c5380aeee64589af1 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 17:26:20 -0600 Subject: [PATCH 036/153] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 385401b5..839da33e 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,10 @@ ``` ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. 
-HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces. -There two usage modes for counters access, system wide sampling and per kernels accumulating. In per kernel usage mode the kernels execution is serialized. +HW specific low-level performance analysis interface for profiling of GPU compute applications. The +profiling includes HW performance counters with complex performance metrics and HW traces. +There two usage modes for counters access, system wide sampling and per kernels accumulating. In per +kernel usage mode the kernels execution is serialized. [The link to profiler default metrics XML specification](test/tool/metrics.xml) ``` From e5982b9ffee7b56e7aeca295bc3f916e036dd35a Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 17:26:42 -0600 Subject: [PATCH 037/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 839da33e..bd7e8f36 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ HW specific low-level performance analysis interface for profiling of GPU comput profiling includes HW performance counters with complex performance metrics and HW traces. There two usage modes for counters access, system wide sampling and per kernels accumulating. In per kernel usage mode the kernels execution is serialized. 
- -[The link to profiler default metrics XML specification](test/tool/metrics.xml) ``` +[The link to profiler default metrics XML specification](test/tool/metrics.xml) + ## Source tree: ``` From d37c0f41c6d2dec537b48339844216dc57e8d12e Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:42:35 -0600 Subject: [PATCH 038/153] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 001bcbe1..a8ef7a2b 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -14,7 +14,7 @@ The library has C API and is based on AQLprofile AMD specific HSA extension. 1. The library provides methods to query the list of supported HW features. 2. The library provides profiling APIs to start, stop, read metrics results and tracing data. - 3. The library provides a callback API for collecting per-kernel profiling data for + 3. The library provides a intercepting API for collecting per-kernel profiling data for the kernels dispatched to HSA AQL queues. 4. The library provides mechanism to load profiling tool library plugin by env variable @@ -427,6 +427,7 @@ hsa_status_t rocprofiler_group_get_data( ``` The library provides a callback API for enabling profiling for the kernels dispatched to HSA AQL queues. The API enables per-kernel profiling data collection. +Currently implemented the option with serializing the kernels execution. ROC profiler callback type: From b8f8e410c030bed0b816c75a8be1eab0ade1a4c6 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:44:11 -0600 Subject: [PATCH 039/153] Update README.md --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index bd7e8f36..31e8458f 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,7 @@ ROC profiler library. 
Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. HW specific low-level performance analysis interface for profiling of GPU compute applications. The -profiling includes HW performance counters with complex performance metrics and HW traces. -There two usage modes for counters access, system wide sampling and per kernels accumulating. In per -kernel usage mode the kernels execution is serialized. +profiling includes HW performance counters with complex performance metrics. ``` [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 3ad6f22ecbf17ab87090d7ea55ad4879cf0891fb Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:45:01 -0600 Subject: [PATCH 040/153] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 31e8458f..b884ad82 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. ``` + +## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From f1718391c5d670184040331d66a81df24cc65eb2 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:45:41 -0600 Subject: [PATCH 041/153] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index b884ad82..48b09814 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,8 @@ # ROC-profiler -``` ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. 
-``` ## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From f7278ac40938bcd60efaf5e69ebb92c2265c0ec1 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:46:05 -0600 Subject: [PATCH 042/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 48b09814..71b673c6 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ profiling includes HW performance counters with complex performance metrics. [The link to profiler default metrics XML specification](test/tool/metrics.xml) -## Source tree: +## Source tree ``` - bin - rocprof - Profiling tool run script From 0ae956103729aed2c1f86964e53516808a07a87e Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 12 Mar 2019 11:49:29 -0500 Subject: [PATCH 043/153] 2.2 update --- bin/rpl_run.sh | 12 +++++-- bin/run_tool.sh | 2 +- inc/rocprofiler.h | 1 + src/core/rocprofiler.cpp | 59 ++++++++++++++++++++++++++++++- test/app/intercept_test_stand.cpp | 2 +- test/run.sh | 13 ++++--- test/tool/tool.cpp | 4 +++ 7 files changed, 82 insertions(+), 11 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 91fd6703..d94ee76f 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -141,10 +141,16 @@ usage() { echo " --heartbeat - to print progress heartbeats [0 - disabled]" echo "" echo " --stats - generating kernel execution stats, file .stats.csv" - echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing" + echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" echo " Generated files: .hsa_stats.txt .json" - echo " --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing" - echo " Generated files: .hip_stats.txt .json" + echo " Traced API list can be set by input .txt or .xml files." 
+ echo " Input .txt:" + echo " hsa: hsa_queue_create hsa_amd_memory_pool_allocate" + echo " Input .xml:" + echo " " + echo " " + echo " " + echo " " echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" diff --git a/bin/run_tool.sh b/bin/run_tool.sh index 5af6d1a1..5ee438c0 100755 --- a/bin/run_tool.sh +++ b/bin/run_tool.sh @@ -27,7 +27,7 @@ fi export HSA_TOOLS_REPORT_LOAD_FAILURE=1 export HSA_VEN_AMD_AQLPROFILE_LOG=1 export ROCPROFILER_LOG=1 -# to prevent internal simple proxy queue +# ROC profiler metrics config file unset ROCP_PROXY_QUEUE # ROC profiler metrics config file export ROCP_METRICS=$BIN_DIR/lib/metrics.xml diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 6aeb26af..5449204b 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -64,6 +64,7 @@ uint32_t rocprofiler_version_minor(); typedef struct { uint32_t intercept_mode; + uint32_t memcopy_tracking; uint32_t sqtt_size; uint32_t sqtt_local; uint64_t timeout; diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index c3c4bd0c..dec62c5c 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -83,6 +83,9 @@ decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacqui decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; +decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; +decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; + ::HsaApiTable* kHsaApiTable; void SaveHsaApi(::HsaApiTable* table) { @@ -137,7 +140,8 @@ void * tool_handle = NULL; // Load profiling tool library // Return true if intercepting mode is enabled enum { - DISPATCH_INTERCEPT_MODE = 0x1 + DISPATCH_INTERCEPT_MODE = 0x1, + MEMCOPY_INTERCEPT_MODE = 0x2 }; uint32_t LoadTool() { uint32_t intercept_mode = 0; @@ -181,6 +185,7 @@ 
uint32_t LoadTool() { util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; + if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE; } return intercept_mode; @@ -313,6 +318,50 @@ hsa_status_t CreateQueuePro( return HSA_STATUS_SUCCESS; } +bool async_copy_handler(hsa_signal_value_t value, void* arg) { + Tracker::entry_t* entry = reinterpret_cast(arg); + printf("%lu: async-copy time(%lu,%lu)\n", entry->index, entry->record->begin, entry->record->end); + return false; +} + +hsa_status_t hsa_amd_memory_async_copy_interceptor( + void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, hsa_signal_t completion_signal) +{ + Tracker* tracker = &Tracker::Instance(); + Tracker::entry_t* tracker_entry = tracker->Alloc(hsa_agent_t{}, completion_signal); + hsa_status_t status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src, + src_agent, size, num_dep_signals, + dep_signals, tracker_entry->signal); + if (status == HSA_STATUS_SUCCESS) { + tracker->EnableMemcopy(tracker_entry, async_copy_handler, reinterpret_cast(tracker_entry)); + } else { + tracker->Delete(tracker_entry); + } + return status; +} + +hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) +{ + Tracker* tracker = &Tracker::Instance(); + Tracker::entry_t* tracker_entry = tracker->Alloc(hsa_agent_t{}, completion_signal); + hsa_status_t status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src, + src_offset, range, copy_agent, + dir, num_dep_signals, dep_signals, + 
tracker_entry->signal); + if (status == HSA_STATUS_SUCCESS) { + tracker->EnableMemcopy(tracker_entry, async_copy_handler, reinterpret_cast(tracker_entry)); + } else { + tracker->Delete(tracker_entry); + } + return status; +} + rocprofiler_properties_t rocprofiler_properties; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; @@ -363,6 +412,14 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa // Loading a tool lib and setting of intercept mode const uint32_t intercept_mode_mask = rocprofiler::LoadTool(); if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) intercept_mode = true; + if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) { + hsa_status_t status = hsa_amd_profiling_async_copy_enable(true); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable"); + rocprofiler::hsa_amd_memory_async_copy_fn = table->amd_ext_->hsa_amd_memory_async_copy_fn; + rocprofiler::hsa_amd_memory_async_copy_rect_fn = table->amd_ext_->hsa_amd_memory_async_copy_rect_fn; + table->amd_ext_->hsa_amd_memory_async_copy_fn = rocprofiler::hsa_amd_memory_async_copy_interceptor; + table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = rocprofiler::hsa_amd_memory_async_copy_rect_interceptor; + } // HSA intercepting if (intercept_mode) { diff --git a/test/app/intercept_test_stand.cpp b/test/app/intercept_test_stand.cpp index 7e6298e7..de3dbdaf 100644 --- a/test/app/intercept_test_stand.cpp +++ b/test/app/intercept_test_stand.cpp @@ -178,7 +178,7 @@ int main() { TestHsa::HsaInstantiate(0); for (unsigned ind = 0; ind < kiter; ++ind) { - printf("Iteration %u:\n", ind); + printf("Iterastion %u:\n", ind); ret_val = RunKernel(0, NULL, diter); if (ret_val) ret_val = RunKernel(0, NULL, diter); } diff --git a/test/run.sh b/test/run.sh index 550ad5b1..580f4713 100755 --- a/test/run.sh +++ b/test/run.sh @@ -56,11 +56,6 @@ if [ ! 
-e $ROCP_TOOL_LIB ] ; then export ROCP_TOOL_LIB=test/libtool.so fi -export ROCP_KITER=1 -export ROCP_DITER=4 -export ROCP_INPUT=input1.xml -eval ./test/ctrl - export ROCP_KITER=50 export ROCP_DITER=50 export ROCP_AGENTS=1 @@ -68,6 +63,14 @@ export ROCP_THRS=1 export ROCP_INPUT=input.xml eval ./test/ctrl +# Memcopies tracking +export ROCP_MCOPY_TRACKING=1 + +export ROCP_KITER=1 +export ROCP_DITER=4 +export ROCP_INPUT=input1.xml +eval ./test/ctrl + #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 4853fd86..d96ab12c 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -834,6 +834,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } it = opts.find("sqtt-local"); if (it != opts.end()) { settings->sqtt_local = (it->second == "on"); } + it = opts.find("memcopies"); + if (it != opts.end()) { settings->memcopy_tracking = (it->second == "on"); } } } // Enable verbose mode @@ -852,6 +854,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) check_env_var("ROCP_SQTT_SIZE", settings->sqtt_size); // Set SQTT local buffer check_env_var("ROCP_SQTT_LOCAL", settings->sqtt_local); + // Set memcopies tracking + check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking); is_sqtt_local = settings->sqtt_local; From 528000626865a16e4960b55347ac34b458a6ccf2 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Mar 2019 16:15:46 -0500 Subject: [PATCH 044/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 71b673c6..3868aeac 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ Options: By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. 
--basenames - to turn on/off truncating of the kernel full function names till the base ones [off] - --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] + --timestamp - to turn on/off the kernel dispatches timestamps, dispatch/begin/end/complete [off] --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] From 391bc82dce12a2cd1d9d7e65f1ecf76737ea1a78 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 2 Apr 2019 10:34:39 -0500 Subject: [PATCH 045/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3868aeac..f7409480 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ Options: hsa: hsa_queue_create hsa_amd_memory_pool_allocate Input .xml: - + From 11535ba1e88de3e218fc9357dca806273de1bfec Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 2 Apr 2019 20:12:41 -0500 Subject: [PATCH 046/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f7409480..9ce54bbd 100644 --- a/README.md +++ b/README.md @@ -40,8 +40,8 @@ profiling includes HW performance counters with complex performance metrics. cd .../rocprofiler mkdir build cd build - export CMAKE_PREFIX_PATH=/opt/rocm - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. + export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm + cmake .. 
make make install ``` From cda8b42ffdd7c5e204de338ef70a76200f157e68 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 2 Apr 2019 20:21:32 -0500 Subject: [PATCH 047/153] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9ce54bbd..c36a99c2 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,7 @@ Options: --stats - generating kernel executino stats, file .stats.csv --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing + 'HCC_HOME' env va is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing Generated files: .hsa_stats.txt .json Traced API list can be set by input .txt or .xml files. From e8cc5960f8783f6ffdb48a86f44f3603411c09c5 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 2 Apr 2019 20:21:55 -0500 Subject: [PATCH 048/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c36a99c2..711bc86b 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ Options: --stats - generating kernel executino stats, file .stats.csv --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing - 'HCC_HOME' env va is required to be set to where 'hcc' is installed. + 'HCC_HOME' env var is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing Generated files: .hsa_stats.txt .json Traced API list can be set by input .txt or .xml files. 
From cc130bd76502d0ee0d335c2b8f66b69ef7d389e9 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 2 Apr 2019 23:18:25 -0500 Subject: [PATCH 049/153] fixing standalone interception for N-GPUs --- src/core/rocprofiler.cpp | 42 ++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index dec62c5c..b4ba5d4a 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -230,13 +230,12 @@ hsa_status_t GetExcStatus(const std::exception& e) { : HSA_STATUS_ERROR; } - -inline size_t CreateEnableCmd(const hsa_agent_t& agent, packet_t* command, const size_t& slot_count) { - rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); - const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); +inline size_t CreateEnableCmd(const rocprofiler::util::AgentInfo* agent_info, packet_t* command, const size_t& slot_count) { const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); const size_t packet_count = (is_legacy) ? 
Profile::LEGACY_SLOT_SIZE_PKT : 1; + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + if (packet_count > slot_count) EXC_RAISING(HSA_STATUS_ERROR, "packet_count > slot_count"); // AQLprofile object @@ -288,9 +287,13 @@ hsa_status_t CreateQueuePro( uint32_t group_segment_size, hsa_queue_t **queue) { - static packet_t enable_cmd_packet[Profile::LEGACY_SLOT_SIZE_PKT]; - static size_t enable_cmd_size = 0; - static std::mutex enable_cmd_mutex; + typedef std::pair cmd_entry_t; + typedef std::vector cmd_vec_t; + static cmd_vec_t cmd_vec; + static uint32_t cmd_mask = 0; + static std::mutex cmd_mutex; + + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); // Create HSA queue hsa_status_t status = hsa_queue_create_fn( @@ -305,15 +308,30 @@ hsa_status_t CreateQueuePro( if (status != HSA_STATUS_SUCCESS) return status; // Create 'Enable' cmd packet - if (enable_cmd_size == 0) { - std::lock_guard lck(enable_cmd_mutex); - if (enable_cmd_size == 0) { - enable_cmd_size = CreateEnableCmd(agent, enable_cmd_packet, Profile::LEGACY_SLOT_SIZE_PKT); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + const uint32_t dev_index = 1 << agent_info->dev_index; + const uint32_t dev_mask = 1 << dev_index; + if ((cmd_mask & dev_mask) == 0) { + std::lock_guard lck(cmd_mutex); + + if ((cmd_mask & dev_mask) == 0) { + cmd_mask |= dev_mask; + // Allocating cmd vector + uint32_t mask = 1; + while (1) { + const uint32_t max = 1 << cmd_vec.size(); + if (mask >= max) cmd_vec.push_back({}); + if (((mask & dev_mask) != 0) || (mask == 0)) break; + mask <<= 1; + } + if (mask == 0) EXC_RAISING(status, "bad device index (" << dev_index << ")"); + // Creating cmd packets + cmd_vec[dev_index].second = CreateEnableCmd(agent_info, cmd_vec[dev_index].first, Profile::LEGACY_SLOT_SIZE_PKT); } } // Enable counters for the queue - rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, 
enable_cmd_packet, enable_cmd_size); + rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, cmd_vec[dev_index].first, cmd_vec[dev_index].second); return HSA_STATUS_SUCCESS; } From f92bc03fa273f8419a92114e401aafe1407f2f91 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 4 Apr 2019 21:29:57 -0500 Subject: [PATCH 050/153] 2.3 update --- bin/build_kernel.sh | 30 ++++ bin/rpl_run.sh | 2 + bin/run_tool.sh | 18 +- bin/tblextr.py | 4 +- inc/rocprofiler.h | 68 +++++++- src/core/context.h | 308 ++++++++++++++++++--------------- src/core/context_pool.h | 193 +++++++++++++++++++++ src/core/hsa_queue.h | 26 +-- src/core/intercept_queue.h | 4 +- src/core/metrics.h | 2 +- src/core/rocprofiler.cpp | 115 ++++++++---- src/core/tracker.h | 34 ++-- src/util/hsa_rsrc_factory.cpp | 173 +++++++++++++----- src/util/hsa_rsrc_factory.h | 76 ++++++-- src/util/logger.h | 21 ++- test/CMakeLists.txt | 4 +- test/app/intercept_test.cpp | 192 ++++++++++++++++---- test/run.sh | 49 +++++- test/tool/input1.xml | 13 +- test/tool/input2.xml | 5 + test/tool/tool.cpp | 4 +- test/util/hsa_rsrc_factory.cpp | 169 +++++++++++++----- test/util/hsa_rsrc_factory.h | 68 +++++++- 23 files changed, 1200 insertions(+), 378 deletions(-) create mode 100755 bin/build_kernel.sh create mode 100644 src/core/context_pool.h create mode 100644 test/tool/input2.xml diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh new file mode 100755 index 00000000..6c4afe6f --- /dev/null +++ b/bin/build_kernel.sh @@ -0,0 +1,30 @@ +#!/bin/sh + +TEST_NAME=$1 +DST_DIR=$2 + +if [ -z "$TEST_NAME" ] ; then + echo "Usage: $0 " + echo " Will look for .cl and will build .so dynamic object library" + exit 1 +fi + +if [ -z "$DST_DIR" ] ; then + DST_DIR=$(dirname TEST_NAME) +fi + +GFXIP=$(/opt/rocm/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p") +if [ -z "$GFXIP" ] ; then + echo "GPU is not found" + exit 1 +fi + +OBJ_PREF=$(echo $GFXIP | head -c 4) +OBJ_NAME=$(echo "_$(basename 
$TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') +OBJ_FILE=${OBJ_PREF}_${OBJ_NAME}.hsaco + +/opt/rocm/opencl/bin/x86_64/clang -cl-std=CL2.0 -cl-std=CL2.0 -include /opt/rocm/opencl/include/opencl-c.h -Xclang -mlink-bitcode-file -Xclang /opt/rocm/opencl/lib/x86_64/bitcode/opencl.amdgcn.bc -Xclang -mlink-bitcode-file -Xclang /opt/rocm/opencl/lib/x86_64/bitcode/ockl.amdgcn.bc -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $OBJ_FILE + +echo "'$OBJ_FILE' is generated for '$GFXIP'" + +exit 0 diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index d94ee76f..78edf446 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -142,6 +142,7 @@ usage() { echo "" echo " --stats - generating kernel execution stats, file .stats.csv" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" + echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" echo " Generated files: .hsa_stats.txt .json" echo " Traced API list can be set by input .txt or .xml files." 
echo " Input .txt:" @@ -302,6 +303,7 @@ while [ 1 ] ; do HSA_TRACE=1 elif [ "$1" = "--hip-trace" ] ; then ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 HIP_TRACE=1 elif [ "$1" = "--verbose" ] ; then diff --git a/bin/run_tool.sh b/bin/run_tool.sh index 5ee438c0..ed1609fa 100755 --- a/bin/run_tool.sh +++ b/bin/run_tool.sh @@ -1,26 +1,27 @@ #!/bin/sh BIN_DIR=`dirname $0` -BIN_DIR=`cd $BIN_DIR; pwd` -PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` -BIN_DIR=$PKG_DIR/bin +BIN_DIR=`realpath $BIN_DIR` +PKG_DIR=${BIN_DIR%/bin} # PATH to custom HSA libs HSA_PATH=$PKG_DIR/lib/hsa if [ -z "$1" ] ; then echo "Usage: $0 " -else + exit 1 +fi + # profiler plugin library test_app=$* # paths to ROC profiler and oher libraries -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH export PATH=.:$PATH # ROC profiler library loaded by HSA runtime export HSA_TOOLS_LIB=librocprofiler64.so.1 # tool library loaded by ROC profiler -if [ -z $ROCP_TOOL_LIB ] ; then +if [ -z "$ROCP_TOOL_LIB" ] ; then export ROCP_TOOL_LIB=libintercept_test.so fi # enable error messages @@ -30,7 +31,8 @@ export ROCPROFILER_LOG=1 # ROC profiler metrics config file unset ROCP_PROXY_QUEUE # ROC profiler metrics config file -export ROCP_METRICS=$BIN_DIR/lib/metrics.xml +if [ -z "$ROCP_METRICS" ] ; then + export ROCP_METRICS=$PKG_DIR/lib/metrics.xml +fi LD_PRELOAD=$ROCP_TOOL_LIB $test_app -fi diff --git a/bin/tblextr.py b/bin/tblextr.py index 4c4cc782..329ab0d8 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -308,7 +308,9 @@ def fill_ops_db(table_name, db, indir): db.insert_entry(table_handle, rec_vals) filtr[corr_id] = 1 - if not gpu_pid in dep_dict: dep_dict[gpu_pid] = {} + if not gpu_pid in dep_dict: + dep_dict[gpu_pid] = {} + dep_dict[gpu_pid]['to'] = {} dep_dict[gpu_pid]['to'][corr_id] = int(rec_vals[0]) / 1000 dep_dict[gpu_pid]['bsp'] = OPS_PID else: fatal("async-copy bad record") diff --git a/inc/rocprofiler.h 
b/inc/rocprofiler.h index 5449204b..1e74c464 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -26,8 +26,9 @@ THE SOFTWARE. // // The goal of the implementation is to provide a HW specific low-level // performance analysis interface for profiling of GPU compute applications. -// The profiling includes HW performance counters with derived -// performance metrics. +// The profiling includes HW performance counters (PMC) with complex +// performance metrics and thread traces (SQTT). The profiling is supported +// by the SQTT, PMC and Callback APIs. // // The library can be used by a tool library loaded by HSA runtime or by // higher level HW independent performance analysis API like PAPI. @@ -46,7 +47,7 @@ THE SOFTWARE. #include #include -#define ROCPROFILER_VERSION_MAJOR 6 +#define ROCPROFILER_VERSION_MAJOR 7 #define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus @@ -219,6 +220,7 @@ typedef struct { const hsa_queue_t* queue; // HSA queue uint64_t queue_index; // Index in the queue uint32_t queue_id; // Queue id + hsa_signal_t completion_signal; // Completion signal const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet const char* kernel_name; // Kernel name uint64_t kernel_object; // Kernel object pointer @@ -381,6 +383,66 @@ hsa_status_t rocprofiler_queue_create_profiled( void* data, uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue); +//////////////////////////////////////////////////////////////////////////////// +// Profiling pool +// +// Support for profiling contexts pool +// The API provides the capability to create a contexts pool for a given agent and a set of features, +// to fetch/release a context entry, to register a callback for the contexts completion.
+ +// Profiling pool handle +typedef void rocprofiler_pool_t; + +// Profiling pool entry +typedef struct { + rocprofiler_t* context; // context object + void* payload; // payload data object +} rocprofiler_pool_entry_t; + +// Profiling handler, called on profiling completion +typedef bool (*rocprofiler_pool_handler_t)(const rocprofiler_pool_entry_t* entry, void* arg); + +// Profiling properties +typedef struct { + uint32_t num_entries; // pool size entries + uint32_t payload_bytes; // payload size bytes + rocprofiler_pool_handler_t handler; // handler on context completion + void* handler_arg; // the handler arg +} rocprofiler_pool_properties_t; + +// Open profiling pool +hsa_status_t rocprofiler_pool_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_pool_t** pool, // [out] pool object + uint32_t mode, // profiling mode mask + rocprofiler_pool_properties_t*); // pool properties + +// Close profiling pool +hsa_status_t rocprofiler_pool_close( + rocprofiler_pool_t* pool); // profiling pool handle + +// Fetch profiling pool entry +hsa_status_t rocprofiler_pool_fetch( + rocprofiler_pool_t* pool, // profiling pool handle + rocprofiler_pool_entry_t* entry); // [out] empty profiling pool entry + +// Release profiling pool entry +hsa_status_t rocprofiler_pool_release( + rocprofiler_pool_entry_t* entry); // released profiling pool entry + +// Iterate fetched profiling pool entries +hsa_status_t rocprofiler_pool_iterate( + rocprofiler_pool_t* pool, // profiling pool handle + hsa_status_t (*callback)(rocprofiler_pool_entry_t* entry, void* data), // callback + void *data); // [in/out] data passed to callback + +// Flush completed entries in profiling pool +hsa_status_t rocprofiler_pool_flush( + rocprofiler_pool_t* pool); // profiling pool handle + +//////////////////////////////////////////////////////////////////////////////// #ifdef __cplusplus } //
extern "C" block #endif // __cplusplus diff --git a/src/core/context.h b/src/core/context.h index 6eb391a8..a59effd0 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -153,149 +153,31 @@ class Context { public: typedef std::map info_map_t; - Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, - const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) - : agent_(agent_info->dev_id), - agent_info_(agent_info), - queue_(queue), - hsa_rsrc_(&util::HsaRsrcFactory::Instance()), - api_(hsa_rsrc_->AqlProfileApi()), - metrics_(NULL), - handler_(handler), - handler_arg_(handler_arg) + static void Create(Context* obj, const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) { - if (info_count == 0) { - set_.push_back(Group(agent_info_, this, 0)); - return; - } - - metrics_ = MetricsDict::Create(agent_info); - if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); - - if (Initialize(info, info_count) == false) { - fprintf(stdout, "\nInput metrics out of HW limit. 
Proposed metrics group set:\n"); fflush(stdout); - MetricsGroupSet(agent_info, info, info_count).Print(stdout); - fprintf(stdout, "\n"); fflush(stdout); - EXC_RAISING(HSA_STATUS_ERROR, "Metrics list exceeds HW limits"); - } - Finalize(); - - if (handler != NULL) { - for (unsigned group_index = 0; group_index < set_.size(); ++group_index) { - set_[group_index].ResetRefsCount(); - const profile_vector_t profile_vector = GetProfiles(group_index); - for (auto& tuple : profile_vector) { - // Handler for stop packet completion - hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, - &set_[group_index]); - } - } - } - } - - ~Context() { - for (const auto& v : info_map_) { - const std::string& name = v.first; - const rocprofiler_feature_t* info = v.second; - if ((info->kind == ROCPROFILER_FEATURE_KIND_METRIC) && - (metrics_map_.find(name) == metrics_map_.end())) { - delete info; - } - } + new (obj) Context(agent_info, queue, info, info_count, handler, handler_arg); + obj->Construct(agent_info, queue, info, info_count, handler, handler_arg); } - // Initialize rocprofiler context - bool Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { - // Register input features to not duplicate by features referencing - for (unsigned i = 0; i < info_count; ++i) { - rocprofiler_feature_t* info = &info_array[i]; - if (!info->name) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); - info_map_[info->name] = info; - } - - // Adding zero group, always present - if (info_count) set_.push_back(Group(agent_info_, this, 0)); - - // Processing input features - for (unsigned i = 0; i < info_count; ++i) { - rocprofiler_feature_t* info = &info_array[i]; - const rocprofiler_feature_kind_t kind = info->kind; - const char* name = info->name; - - if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { // Processing metrics features - const Metric* metric = metrics_->Get(name); - if (metric == NULL) - EXC_RAISING(HSA_STATUS_ERROR, "input 
metric '" << name << "' is not found"); -#if 0 - std::cout << " " << name << (metric->GetExpr() ? " = " + metric->GetExpr()->String() : " counter") << std::endl; -#endif + static void Release(Context* obj) { obj->Destruct(); } - auto ret = metrics_map_.insert({name, metric}); - if (!ret.second) - EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name - << "' is registered more then once"); - - counters_vec_t counters_vec = metric->GetCounters(); - if (counters_vec.empty()) - EXC_RAISING(HSA_STATUS_ERROR, "bad metric '" << name << "' is empty"); - - for (const counter_t* counter : counters_vec) { - // For metrics expressions checking that there is no the same counter in the input metrics - // and also that the counter wasn't registered already by another input metric expression - if (metric->GetExpr()) { - if (info_map_.find(counter->name) != info_map_.end()) { - continue; - } else { - info = NewCounterInfo(counter); - info_map_[info->name] = info; - } - } - - const event_t* event = &(counter->event); - const block_des_t block_des = {event->block_name, event->block_index}; - auto ret = groups_map_.insert({block_des, {}}); - block_status_t& block_status = ret.first->second; - if (block_status.max_counters == 0) { - profile_t query = {}; - query.agent = agent_; - query.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; - query.events = event; - - uint32_t block_counters; - hsa_status_t status = api_->hsa_ven_amd_aqlprofile_get_info( - &query, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); - if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "get block_counters info"); - block_status.max_counters = block_counters; - } - if (block_status.counter_index >= block_status.max_counters) { - return false; - - block_status.counter_index = 0; - block_status.group_index += 1; - } - block_status.counter_index += 1; - if (block_status.group_index >= set_.size()) { - set_.push_back(Group(agent_info_, this, block_status.group_index)); - } - const uint32_t group_index 
= block_status.group_index; - set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); - } - } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features - set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); - } else { - EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); - } + static Context* Create(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) + { + Context* obj = new Context(agent_info, queue, info, info_count, handler, handler_arg); + if (obj == NULL) EXC_RAISING(HSA_STATUS_ERROR, "allocation error"); + try { + obj->Construct(agent_info, queue, info, info_count, handler, handler_arg); + } catch(...) { + delete obj; + obj = NULL; + throw; } - - return true; + return obj; } - void Finalize() { - for (unsigned index = 0; index < set_.size(); ++index) { - const hsa_status_t status = set_[index].Finalize(); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed"); - } - } + static void Destroy(Context* obj) { if (obj != NULL) delete obj; } void Reset(const uint32_t& group_index) { set_[group_index].ResetRefsCount(); } @@ -415,6 +297,160 @@ class Context { rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } private: + Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) + : agent_(agent_info->dev_id), + agent_info_(agent_info), + queue_(queue), + hsa_rsrc_(&util::HsaRsrcFactory::Instance()), + api_(hsa_rsrc_->AqlProfileApi()), + metrics_(NULL), + handler_(handler), + handler_arg_(handler_arg) + {} + + ~Context() { Destruct(); } + + void Destruct() { + for (const auto& v : info_map_) { + const std::string& name = v.first; + const rocprofiler_feature_t* info = v.second; + if ((info->kind == 
ROCPROFILER_FEATURE_KIND_METRIC) && + (metrics_map_.find(name) == metrics_map_.end())) { + delete info; + } + } + } + + void Construct(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) + { + if (info_count == 0) { + set_.push_back(Group(agent_info_, this, 0)); + return; + } + + metrics_ = MetricsDict::Create(agent_info); + if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + + if (Initialize(info, info_count) == false) { + fprintf(stdout, "\nInput metrics out of HW limit. Proposed metrics group set:\n"); fflush(stdout); + MetricsGroupSet(agent_info, info, info_count).Print(stdout); + fprintf(stdout, "\n"); fflush(stdout); + EXC_RAISING(HSA_STATUS_ERROR, "Metrics list exceeds HW limits"); + } + Finalize(); + + if (handler != NULL) { + for (unsigned group_index = 0; group_index < set_.size(); ++group_index) { + set_[group_index].ResetRefsCount(); + const profile_vector_t profile_vector = GetProfiles(group_index); + for (auto& tuple : profile_vector) { + // Handler for stop packet completion + hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, + &set_[group_index]); + } + } + } + } + + // Initialize rocprofiler context + bool Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { + // Register input features to not duplicate by features referencing + for (unsigned i = 0; i < info_count; ++i) { + rocprofiler_feature_t* info = &info_array[i]; + const rocprofiler_feature_kind_t kind = info->kind; + const char* name = info->name; + if (!name) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); + info_map_[name] = info; + if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { + auto ret = metrics_map_.insert({name, NULL}); + if (!ret.second) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name + << "' is registered more then once"); + } + } + + // Adding zero group, always 
present + if (info_count) set_.push_back(Group(agent_info_, this, 0)); + + // Processing input features + for (unsigned i = 0; i < info_count; ++i) { + rocprofiler_feature_t* info = &info_array[i]; + const rocprofiler_feature_kind_t kind = info->kind; + const char* name = info->name; + + if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { // Processing metrics features + const Metric* metric = metrics_->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); +#if 0 + std::cout << " " << name << (metric->GetExpr() ? " = " + metric->GetExpr()->String() : " counter") << std::endl; +#endif + + metrics_map_[name] = metric; + counters_vec_t counters_vec = metric->GetCounters(); + if (counters_vec.empty()) + EXC_RAISING(HSA_STATUS_ERROR, "bad metric '" << name << "' is empty"); + + for (const counter_t* counter : counters_vec) { + // For metrics expressions checking that there is no the same counter in the input metrics + // and also that the counter wasn't registered already by another input metric expression + if (metric->GetExpr()) { + if (info_map_.find(counter->name) != info_map_.end()) { + continue; + } else { + info = NewCounterInfo(counter); + info_map_[info->name] = info; + } + } + + const event_t* event = &(counter->event); + const block_des_t block_des = {event->block_name, event->block_index}; + auto ret = groups_map_.insert({block_des, {}}); + block_status_t& block_status = ret.first->second; + if (block_status.max_counters == 0) { + profile_t query = {}; + query.agent = agent_; + query.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + query.events = event; + + uint32_t block_counters; + hsa_status_t status = api_->hsa_ven_amd_aqlprofile_get_info( + &query, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "get block_counters info"); + block_status.max_counters = block_counters; + } + if (block_status.counter_index >= 
block_status.max_counters) { + return false; + + block_status.counter_index = 0; + block_status.group_index += 1; + } + block_status.counter_index += 1; + if (block_status.group_index >= set_.size()) { + set_.push_back(Group(agent_info_, this, block_status.group_index)); + } + const uint32_t group_index = block_status.group_index; + set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); + } + } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features + set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); + } else { + EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); + } + } + + return true; + } + + void Finalize() { + for (unsigned index = 0; index < set_.size(); ++index) { + const hsa_status_t status = set_[index].Finalize(); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed"); + } + } + // Getting profling packets profile_vector_t GetProfiles(const uint32_t& index) { profile_vector_t vec; diff --git a/src/core/context_pool.h b/src/core/context_pool.h new file mode 100644 index 00000000..3056cccc --- /dev/null +++ b/src/core/context_pool.h @@ -0,0 +1,193 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_CORE_CONTEXT_POOL_H_ +#define SRC_CORE_CONTEXT_POOL_H_ + +#include "inc/rocprofiler.h" + +#include + +#include "core/context.h" + +namespace rocprofiler { +class ContextPool { + public: + typedef uint64_t index_t; + typedef std::mutex mutex_t; + + struct entry_t { + ContextPool* pool; + Context* context; + std::atomic completed; + }; + + static ContextPool* Create( + uint32_t num_entries, + uint32_t payload_bytes, + const util::AgentInfo* agent_info, + rocprofiler_feature_t* info, + const uint32_t info_count, + rocprofiler_pool_handler_t handler, + void* handler_arg) + { + ContextPool* obj = new ContextPool(num_entries, payload_bytes, agent_info, info, info_count, handler, handler_arg); + if (obj == NULL) EXC_RAISING(HSA_STATUS_ERROR, "allocation error"); + return obj; + } + + static void Destroy(ContextPool* pool) { delete pool; } + + void Fetch(rocprofiler_pool_entry_t* pool_entry) { + if (constructed_ == false) { + Construct(agent_info_, info_, info_count_); + } + const index_t write_index = write_index_.fetch_add(entry_size_bytes_, std::memory_order_relaxed); + while (write_index >= (read_index_.load(std::memory_order_acquire) + array_size_bytes_)) { + check_completed(); + std::this_thread::yield(); + } + entry_t* entry = GetPoolEntry(write_index, pool_entry); + if (entry->completed.load(std::memory_order_relaxed) != false) EXC_RAISING(HSA_STATUS_ERROR, "Corrupted pool entry"); + } + + void Flush() { + 
check_completed(); + } + + private: + static unsigned aligned64(const unsigned& size) { return (size + 0x3f) & ~0x3fu; } + + static bool context_handler(rocprofiler_group_t group, void* arg) { + entry_t* entry = reinterpret_cast(arg); + entry->completed.store(true, std::memory_order_release); + entry->pool->check_completed(); + return true; + } + + ContextPool( + uint32_t num_entries, + uint32_t payload_bytes, + const util::AgentInfo* agent_info, + rocprofiler_feature_t* info, + const uint32_t info_count, + rocprofiler_pool_handler_t pool_handler, + void* pool_handler_arg + ) : + payload_off_(aligned64(sizeof(entry_t))), + entry_size_bytes_(payload_off_ + aligned64(payload_bytes)), + array_size_bytes_(entry_size_bytes_ * num_entries), + array_(NULL), + read_index_(0), + write_index_(0), + sync_flag_(false), + + agent_info_(agent_info), + info_(info), + info_count_(info_count), + pool_handler_(pool_handler), + pool_handler_arg_(pool_handler_arg), + constructed_(false) + {} + + void Construct(const util::AgentInfo* agent_info, rocprofiler_feature_t* info, const uint32_t info_count) { + std::lock_guard lck(mutex_); + + if (constructed_ == false) { + array_data_ = (char*) malloc(array_size_bytes_ + 0x3f); + array_ = reinterpret_cast(((intptr_t)array_data_ + 0x3f) >> 6 << 6); + if (((intptr_t)array_ & 0x3f) != 0) EXC_RAISING(HSA_STATUS_ERROR, "Pool array is not aligned"); + memset(array_, 0, array_size_bytes_); + + const char* end = array_ + array_size_bytes_; + for (char* ptr = array_; ptr < end; ptr += entry_size_bytes_) { + entry_t* entry = reinterpret_cast(ptr); + entry->pool = this; + entry->context = Context::Create(agent_info, NULL, info, info_count, ContextPool::context_handler, ptr); + } + + constructed_ = true; + } + } + + ~ContextPool() { + const char* end = array_ + array_size_bytes_; + for (char* ptr = array_; ptr < end; ptr += entry_size_bytes_) { + entry_t* entry = reinterpret_cast(ptr); + Context::Destroy(entry->context); + } + free(array_); + } + + 
char* GetArrayPtr(const uint32_t& index) { return array_ + (index % array_size_bytes_); } + + entry_t* GetPoolEntry(const uint32_t& index, rocprofiler_pool_entry_t* pool_entry) { + char* ptr = GetArrayPtr(index); + entry_t* entry = reinterpret_cast(ptr); + void* payload = ptr + payload_off_; + *pool_entry = rocprofiler_pool_entry_t{}; + pool_entry->context = reinterpret_cast(entry->context); + pool_entry->payload = payload; + return entry; + } + + void check_completed() { + if (sync_flag_.test_and_set(std::memory_order_acquire) == false) { + index_t read_index = read_index_.load(std::memory_order_relaxed); + const index_t write_index = write_index_.load(std::memory_order_relaxed); + while(read_index < write_index) { + rocprofiler_pool_entry_t pool_entry{}; + entry_t* entry = GetPoolEntry(read_index, &pool_entry); + if (entry->completed.load(std::memory_order_acquire) == true) { + pool_handler_(&pool_entry, pool_handler_arg_); + entry->completed.store(false, std::memory_order_relaxed); + read_index += entry_size_bytes_; + read_index_.store(read_index, std::memory_order_release); + } else { + break; + } + } + sync_flag_.clear(std::memory_order_release); + } + } + + const uint32_t payload_off_; + const uint32_t entry_size_bytes_; + const uint32_t array_size_bytes_; + char* array_data_; + char* array_; + volatile std::atomic read_index_; + volatile std::atomic write_index_; + volatile std::atomic_flag sync_flag_; + + const util::AgentInfo* agent_info_; + rocprofiler_feature_t* info_; + const uint32_t info_count_; + rocprofiler_pool_handler_t pool_handler_; + void* pool_handler_arg_; + + bool constructed_; + mutex_t mutex_; +}; +} // namespace rocprofiler + +#endif // SRC_CORE_CONTEXT_POOL_H_ diff --git a/src/core/hsa_queue.h b/src/core/hsa_queue.h index 12ef97bb..7c7d96c6 100644 --- a/src/core/hsa_queue.h +++ b/src/core/hsa_queue.h @@ -35,31 +35,7 @@ class HsaQueue : public Queue { HsaQueue(const util::AgentInfo* agent_info, hsa_queue_t* queue) : queue_(queue) {} void 
Submit(const packet_t* packet) { - // Compute the write index of queue and copy Aql packet into it - const uint64_t que_idx = hsa_queue_load_write_index_relaxed(queue_); - // Increment the write index - hsa_queue_store_write_index_relaxed(queue_, que_idx + 1); - - const uint32_t mask = queue_->size - 1; - - // Copy packet to the queue - const packet_word_t* src = reinterpret_cast(packet); - packet_t* slot = reinterpret_cast(queue_->base_address) + (que_idx & mask); - packet_word_t* dst = reinterpret_cast(slot); - const uint32_t nwords = sizeof(packet_t) / sizeof(packet_word_t); - for (unsigned i = 1; i < nwords; ++i) { - dst[i] = src[i]; - } - - // To maintain global order to ensure the prior copy of the packet contents is made visible - // before the header is updated. - // With in-order CP it will wait until the first packet in the blob will be valid - std::atomic* header_atomic_ptr = - reinterpret_cast*>(&dst[0]); - header_atomic_ptr->store(src[0], std::memory_order_release); - - // Doorbell signaling - hsa_signal_store_relaxed(queue_->doorbell_signal, que_idx); + rocprofiler::util::HsaRsrcFactory::Instance().Submit(queue_, packet); } private: diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index e41dcd0f..5a6234ab 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -72,7 +72,7 @@ class InterceptQueue { if (tracker_on || tracker_on_) { if (tracker_ == NULL) tracker_ = &Tracker::Instance(); - status = hsa_amd_profiling_set_profiler_enabled(*queue, true); + status = rocprofiler::util::HsaRsrcFactory::HsaApi()->hsa_amd_profiling_set_profiler_enabled(*queue, true); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_set_profiler_enabled()"); } @@ -138,6 +138,7 @@ class InterceptQueue { if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && (dispatch_callback_ != NULL)) { const hsa_kernel_dispatch_packet_t* dispatch_packet = reinterpret_cast(packet); + const hsa_signal_t completion_signal = 
dispatch_packet->completion_signal; // Adding kernel timing tracker Tracker::entry_t* tracker_entry = NULL; @@ -155,6 +156,7 @@ class InterceptQueue { obj->queue_, user_que_idx, obj->queue_id, + completion_signal, dispatch_packet, kernel_name, kernel_symbol, diff --git a/src/core/metrics.h b/src/core/metrics.h index 46806dcf..cb55d189 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -170,7 +170,7 @@ class MetricsDict { const_iterator_t End() const { return cache_.end(); } xml::Xml::nodes_t GetNodes(const std::string& scope) const { - return xml_->GetNodes("top." + scope + ".metric"); + return (xml_ != NULL) ? xml_->GetNodes("top." + scope + ".metric") : xml::Xml::nodes_t(); } private: diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index b4ba5d4a..de16fa19 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -28,6 +28,7 @@ THE SOFTWARE. #include #include "core/context.h" +#include "core/context_pool.h" #include "core/hsa_queue.h" #include "core/intercept_queue.h" #include "core/proxy_queue.h" @@ -89,6 +90,8 @@ decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; ::HsaApiTable* kHsaApiTable; void SaveHsaApi(::HsaApiTable* table) { + util::HsaRsrcFactory::InitHsaApiTable(table); + kHsaApiTable = table; hsa_queue_create_fn = table->core_->hsa_queue_create_fn; hsa_queue_destroy_fn = table->core_->hsa_queue_destroy_fn; @@ -230,11 +233,12 @@ hsa_status_t GetExcStatus(const std::exception& e) { : HSA_STATUS_ERROR; } -inline size_t CreateEnableCmd(const rocprofiler::util::AgentInfo* agent_info, packet_t* command, const size_t& slot_count) { - const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); - const size_t packet_count = (is_legacy) ? 
Profile::LEGACY_SLOT_SIZE_PKT : 1; +inline size_t CreateEnableCmd(const hsa_agent_t& agent, packet_t* command, const size_t& slot_count) { rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); + const size_t packet_count = (is_legacy) ? Profile::LEGACY_SLOT_SIZE_PKT : 1; if (packet_count > slot_count) EXC_RAISING(HSA_STATUS_ERROR, "packet_count > slot_count"); @@ -287,13 +291,9 @@ hsa_status_t CreateQueuePro( uint32_t group_segment_size, hsa_queue_t **queue) { - typedef std::pair cmd_entry_t; - typedef std::vector cmd_vec_t; - static cmd_vec_t cmd_vec; - static uint32_t cmd_mask = 0; - static std::mutex cmd_mutex; - - rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + static packet_t enable_cmd_packet[Profile::LEGACY_SLOT_SIZE_PKT]; + static size_t enable_cmd_size = 0; + static std::mutex enable_cmd_mutex; // Create HSA queue hsa_status_t status = hsa_queue_create_fn( @@ -308,30 +308,15 @@ hsa_status_t CreateQueuePro( if (status != HSA_STATUS_SUCCESS) return status; // Create 'Enable' cmd packet - const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); - const uint32_t dev_index = 1 << agent_info->dev_index; - const uint32_t dev_mask = 1 << dev_index; - if ((cmd_mask & dev_mask) == 0) { - std::lock_guard lck(cmd_mutex); - - if ((cmd_mask & dev_mask) == 0) { - cmd_mask |= dev_mask; - // Allocating cmd vector - uint32_t mask = 1; - while (1) { - const uint32_t max = 1 << cmd_vec.size(); - if (mask >= max) cmd_vec.push_back({}); - if (((mask & dev_mask) != 0) || (mask == 0)) break; - mask <<= 1; - } - if (mask == 0) EXC_RAISING(status, "bad device index (" << dev_index << ")"); - // Creating cmd packets - cmd_vec[dev_index].second = CreateEnableCmd(agent_info, cmd_vec[dev_index].first, 
Profile::LEGACY_SLOT_SIZE_PKT); + if (enable_cmd_size == 0) { + std::lock_guard lck(enable_cmd_mutex); + if (enable_cmd_size == 0) { + enable_cmd_size = CreateEnableCmd(agent, enable_cmd_packet, Profile::LEGACY_SLOT_SIZE_PKT); } } // Enable counters for the queue - rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, cmd_vec[dev_index].first, cmd_vec[dev_index].second); + rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, enable_cmd_packet, enable_cmd_size); return HSA_STATUS_SUCCESS; } @@ -383,11 +368,11 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( rocprofiler_properties_t rocprofiler_properties; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; -Tracker* Tracker::instance_ = NULL; +std::atomic Tracker::instance_{}; Tracker::mutex_t Tracker::glob_mutex_; Tracker::counter_t Tracker::counter_ = 0; util::Logger::mutex_t util::Logger::mutex_; -util::Logger* util::Logger::instance_ = NULL; +std::atomic util::Logger::instance_{}; } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -494,8 +479,9 @@ PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_ } } - *handle = new rocprofiler::Context(agent_info, queue, features, feature_count, properties->handler, - properties->handler_arg); + rocprofiler::Context** context_ret = reinterpret_cast(handle); + *context_ret = rocprofiler::Context::Create(agent_info, queue, features, feature_count, + properties->handler, properties->handler_arg); API_METHOD_SUFFIX } @@ -503,7 +489,7 @@ PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_ PUBLIC_API hsa_status_t rocprofiler_close(rocprofiler_t* handle) { API_METHOD_PREFIX rocprofiler::Context* context = reinterpret_cast(handle); - if (context) delete context; + if (context) rocprofiler::Context::Destroy(context); API_METHOD_SUFFIX } @@ -625,6 +611,64 @@ PUBLIC_API hsa_status_t 
rocprofiler_iterate_trace_data( API_METHOD_SUFFIX } +//////////////////////////////////////////////////////////////////////////////// +// Open profiling pool +PUBLIC_API hsa_status_t rocprofiler_pool_open(hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_pool_t** pool, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_pool_properties_t* properties) // pool properties +{ + API_METHOD_PREFIX + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + if (agent_info == NULL) { + EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); + } + + rocprofiler::ContextPool* obj = rocprofiler::ContextPool::Create( + properties->num_entries, + properties->payload_bytes, + agent_info, + features, + feature_count, + properties->handler, + properties->handler_arg + ); + *pool = reinterpret_cast(obj); + API_METHOD_SUFFIX +} + +// Close profiling pool +PUBLIC_API hsa_status_t rocprofiler_pool_close(rocprofiler_pool_t* pool) // profiling pool handle +{ + API_METHOD_PREFIX + rocprofiler::ContextPool* obj = reinterpret_cast(pool); + rocprofiler::ContextPool::Destroy(obj); + API_METHOD_SUFFIX +} + +// Fetch profiling pool entry +PUBLIC_API hsa_status_t rocprofiler_pool_fetch(rocprofiler_pool_t* pool, // profiling pool handle + rocprofiler_pool_entry_t* entry) // [out] empty profling pool entry +{ + API_METHOD_PREFIX + rocprofiler::ContextPool* context_pool = reinterpret_cast(pool); + context_pool->Fetch(entry); + API_METHOD_SUFFIX +} + +// Fetch profiling pool entry +PUBLIC_API hsa_status_t rocprofiler_pool_flush(rocprofiler_pool_t* pool) // profiling pool handle +{ + API_METHOD_PREFIX + rocprofiler::ContextPool* context_pool = reinterpret_cast(pool); + context_pool->Flush(); + API_METHOD_SUFFIX +} + 
+//////////////////////////////////////////////////////////////////////////////// // Return the info for a given info kind PUBLIC_API hsa_status_t rocprofiler_get_info( const hsa_agent_t *agent, @@ -687,6 +731,7 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_info( info.metric.name = strdup(name.c_str()); info.metric.description = strdup(descr.c_str()); info.metric.expr = expr.empty() ? NULL : strdup(expr.c_str()); + info.metric.instances = 1; if (expr.empty()) { // Getting the block name diff --git a/src/core/tracker.h b/src/core/tracker.h index 0cada86f..ffc06b85 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -66,13 +66,19 @@ class Tracker { static Tracker* Create() { std::lock_guard lck(glob_mutex_); - if (instance_ == NULL) instance_ = new Tracker; - return instance_; + Tracker* obj = instance_.load(std::memory_order_relaxed); + if (obj == NULL) { + obj = new Tracker; + if (obj == NULL) EXC_ABORT(HSA_STATUS_ERROR, "Tracker creation failed"); + instance_.store(obj, std::memory_order_release); + } + return obj; } static Tracker& Instance() { - if (instance_ == NULL) instance_ = Create(); - return *instance_; + Tracker* obj = instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(); + return *obj; } static void Destroy() { @@ -99,9 +105,9 @@ class Tracker { entry->record = record; // Creating a proxy signal - status = hsa_signal_create(1, 0, NULL, &(entry->signal)); + status = hsa_api_.hsa_signal_create(1, 0, NULL, &(entry->signal)); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); - status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); // Adding antry to the list @@ -115,7 +121,7 @@ class Tracker { // Delete tracker entry void Delete(entry_t* entry) { - 
hsa_signal_destroy(entry->signal); + hsa_api_.hsa_signal_destroy(entry->signal); mutex_.lock(); sig_list_.erase(entry->it); mutex_.unlock(); @@ -151,7 +157,8 @@ class Tracker { private: Tracker() : outstanding_(0), - hsa_rsrc_(&(util::HsaRsrcFactory::Instance())) + hsa_rsrc_(&(util::HsaRsrcFactory::Instance())), + hsa_api_(*(hsa_rsrc_->HsaApi())) {} ~Tracker() { @@ -181,13 +188,13 @@ class Tracker { // Query begin/end and complete timestamps if (entry->is_memcopy) { hsa_amd_profiling_async_copy_time_t async_copy_time{}; - hsa_status_t status = hsa_amd_profiling_get_async_copy_time(entry->signal, &async_copy_time); + hsa_status_t status = hsa_api_.hsa_amd_profiling_get_async_copy_time(entry->signal, &async_copy_time); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_async_copy_time"); record->begin = hsa_rsrc_->SysclockToNs(async_copy_time.start); record->end = hsa_rsrc_->SysclockToNs(async_copy_time.end); } else { hsa_amd_profiling_dispatch_time_t dispatch_time{}; - hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); + hsa_status_t status = hsa_api_.hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end); @@ -204,9 +211,9 @@ class Tracker { orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; - const hsa_signal_value_t new_value = hsa_signal_load_relaxed(orig) - 1; + const hsa_signal_value_t new_value = hsa_api_.hsa_signal_load_relaxed(orig) - 1; if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value"); - hsa_signal_store_screlease(orig, signal_value); + hsa_api_.hsa_signal_store_screlease(orig, signal_value); } } @@ -260,7 +267,7 @@ class Tracker { } // 
instance - static Tracker* instance_; + static std::atomic instance_; static mutex_t glob_mutex_; static counter_t counter_; @@ -273,6 +280,7 @@ class Tracker { std::atomic outstanding_; // HSA resources factory util::HsaRsrcFactory* hsa_rsrc_; + const util::hsa_pfn_t& hsa_api_; // Handling ordering enabled static const bool ordering_enabled_ = false; // Enable tracing diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 2d64bae0..a47062dd 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -77,13 +77,13 @@ static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); if (HSA_AMD_SEGMENT_GLOBAL != segment) { return HSA_STATUS_SUCCESS; } - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; @@ -117,14 +117,16 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize cpu_pool_ = NULL; kern_arg_pool_ = NULL; + InitHsaApiTable(NULL); + // Initialize the Hsa Runtime if (initialize_hsa_) { - status = hsa_init(); + status = hsa_api_.hsa_init(); CHECK_STATUS("Error in hsa_init", status); } // Discover the set of Gpu devices available on the platform - status = hsa_iterate_agents(GetHsaAgentsCallback, this); + status = hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", 
HSA_STATUS_ERROR); if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); @@ -134,17 +136,17 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else - status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table loader_api_ = {0}; - status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer - timer_ = new HsaTimer; + timer_ = new HsaTimer(&hsa_api_); CHECK_STATUS("HSA timer allocation failed", (timer_ == NULL) ? 
HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); @@ -158,11 +160,95 @@ HsaRsrcFactory::~HsaRsrcFactory() { for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { - hsa_status_t status = hsa_shut_down(); + hsa_status_t status = hsa_api_.hsa_shut_down(); CHECK_STATUS("Error in hsa_shut_down", status); } } +void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { + std::lock_guard lck(mutex_); + + if (hsa_api_.hsa_init == NULL) { + if (table != NULL) { + hsa_api_.hsa_init = table->core_->hsa_init_fn; + hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; + hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; + hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; + + hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; + hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; + hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; + hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + + hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; + hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; + hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; + hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; + hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn; + hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; + + hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn; + hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; + hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; + hsa_api_.hsa_executable_freeze = 
table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + + hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; + hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; + hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn; + hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; + hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn; + hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn; + + hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn; + hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; + hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; + } else { + hsa_api_.hsa_init = hsa_init; + hsa_api_.hsa_shut_down = hsa_shut_down; + hsa_api_.hsa_agent_get_info = hsa_agent_get_info; + hsa_api_.hsa_iterate_agents = hsa_iterate_agents; + + hsa_api_.hsa_queue_create = hsa_queue_create; + hsa_api_.hsa_queue_destroy = hsa_queue_destroy; + hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; + hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; + hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + + hsa_api_.hsa_signal_create = hsa_signal_create; + hsa_api_.hsa_signal_destroy = hsa_signal_destroy; + hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; + hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed; + hsa_api_.hsa_signal_wait_scacquire = 
hsa_signal_wait_scacquire; + hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; + + hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file; + hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; + hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; + hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + + hsa_api_.hsa_system_get_info = hsa_system_get_info; + hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; + hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; + hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; + hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; + hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; + + hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = hsa_amd_profiling_set_profiler_enabled; + hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time; + hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time; + } + } +} + hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { void* handle = dlopen(kAqlProfileLib, RTLD_NOW); if (handle == NULL) { @@ -204,7 +290,7 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { AgentInfo* agent_info = NULL; hsa_device_type_t type; - status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); CHECK_STATUS("Error Calling hsa_agent_get_info", status); if (type == HSA_DEVICE_TYPE_CPU) { @@ -213,9 +299,9 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_type = 
HSA_DEVICE_TYPE_CPU; agent_info->dev_index = cpu_list_.size(); - status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; - status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; @@ -227,28 +313,28 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info = new AgentInfo{}; agent_info->dev_id = agent; agent_info->dev_type = HSA_DEVICE_TYPE_GPU; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); strncpy(agent_info->gfxip, agent_info->name, 4); agent_info->gfxip[4] = '\0'; - hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); - hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); - hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? 
true : false; - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num); - hsa_agent_get_info(agent, + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se); agent_info->cpu_pool = {}; agent_info->kern_arg_pool = {}; - status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); // Set GPU index @@ -339,7 +425,7 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { hsa_status_t status; - status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, queue); return (status == HSA_STATUS_SUCCESS); } @@ -350,7 +436,7 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, // @return bool true if successful, false otherwise bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* 
signal) { hsa_status_t status; - status = hsa_signal_create(value, 0, NULL, signal); + status = hsa_api_.hsa_signal_create(value, 0, NULL, signal); return (status == HSA_STATUS_SUCCESS); } @@ -363,7 +449,7 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } @@ -378,11 +464,11 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size uint8_t* buffer = NULL; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? 
buffer : NULL; @@ -398,11 +484,11 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { - status = hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; @@ -426,7 +512,7 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { while (1) { const hsa_signal_value_t signal_value = - hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); + hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); if (signal_value == 0) { break; } else { @@ -439,7 +525,7 @@ void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { // Wait signal with signal value restore void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { SignalWait(signal); - hsa_signal_store_relaxed(const_cast(signal), signal_value); + hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } // Copy data from GPU to host memory @@ -447,12 +533,12 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; - status = hsa_signal_create(1, 0, NULL, &s); + status = hsa_api_.hsa_signal_create(1, 0, NULL, &s); CHECK_STATUS("hsa_signal_create()", status); - status = 
hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); SignalWait(s); - status = hsa_signal_destroy(s); + status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); @@ -494,29 +580,29 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Create code object reader hsa_code_object_reader_t code_obj_rdr = {0}; - status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); + status = hsa_api_.hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); if (status != HSA_STATUS_SUCCESS) { std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; return false; } // Create executable. - status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + status = hsa_api_.hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable); CHECK_STATUS("Error in creating executable object", status); // Load code object. - status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, + status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, NULL, NULL); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. - status = hsa_executable_freeze(*executable, ""); + status = hsa_api_.hsa_executable_freeze(*executable, ""); CHECK_STATUS("Error in freezing executable object", status); // Get symbol handle. 
hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, + status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); @@ -554,9 +640,9 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue - const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); - hsa_queue_store_write_index_relaxed(queue, write_idx + 1); - while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { + const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue); + hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { sched_yield(); } @@ -573,7 +659,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { header_atomic_ptr->store(slot_data[0], std::memory_order_release); // ringdoor bell - hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); + hsa_api_.hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); return write_idx; } @@ -595,9 +681,10 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; +std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; +hsa_pfn_t HsaRsrcFactory::hsa_api_{}; } // namespace util } // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index b3f3cf0d..f982ddde 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -26,6 +26,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#define SRC_UTIL_HSA_RSRC_FACTORY_H_ #include +#include #include #include #include @@ -35,6 +36,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -69,6 +71,46 @@ static const size_t MEM_PAGE_BYTES = 0x1000; static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; +struct hsa_pfn_t { + decltype(hsa_init)* hsa_init; + decltype(hsa_shut_down)* hsa_shut_down; + decltype(hsa_agent_get_info)* hsa_agent_get_info; + decltype(hsa_iterate_agents)* hsa_iterate_agents; + + decltype(hsa_queue_create)* hsa_queue_create; + decltype(hsa_queue_destroy)* hsa_queue_destroy; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + + decltype(hsa_signal_create)* hsa_signal_create; + decltype(hsa_signal_destroy)* hsa_signal_destroy; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed; + decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; + + decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file; + decltype(hsa_executable_create_alt)* hsa_executable_create_alt; + decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; + decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + + decltype(hsa_system_get_info)* hsa_system_get_info; + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; + decltype(hsa_amd_memory_pool_allocate)* 
hsa_amd_memory_pool_allocate; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; + + decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled; + decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time; +}; + // Encapsulates information about a Hsa Agent such as its // handle, name, max queue size, max wavefront size, etc. struct AgentInfo { @@ -128,9 +170,9 @@ class HsaTimer { static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; - HsaTimer() { + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { timestamp_t sysclock_hz = 0; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; } @@ -146,7 +188,7 @@ class HsaTimer { // Return timestamp in 'ns' timestamp_t timestamp_ns() const { timestamp_t sysclock; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); return sysclock_to_ns(sysclock); } @@ -154,6 +196,8 @@ class HsaTimer { private: // Timestamp frequency factor freq_t sysclock_factor_; + // HSA API table + const hsa_pfn_t* const hsa_api_; }; class HsaRsrcFactory { @@ -164,17 +208,20 @@ class HsaRsrcFactory { static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); - if (instance_ == NULL) { - instance_ = new 
HsaRsrcFactory(initialize_hsa); + HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed); + if (obj == NULL) { + obj = new HsaRsrcFactory(initialize_hsa); + instance_.store(obj, std::memory_order_release); } - return instance_; + return obj; } static HsaRsrcFactory& Instance() { - if (instance_ == NULL) instance_ = Create(false); - hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + HsaRsrcFactory* obj = instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(false); + hsa_status_t status = (obj != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); - return *instance_; + return *obj; } static void Destroy() { @@ -276,6 +323,10 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + // Initialize HSA API table + void static InitHsaApiTable(HsaApiTable* table); + static const hsa_pfn_t* HsaApi() { return &hsa_api_; } + // Return AqlProfile API table typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } @@ -293,7 +344,7 @@ class HsaRsrcFactory { static void SetTimeoutNs(const timestamp_t& time) { std::lock_guard lck(mutex_); timeout_ns_ = time; - if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time); + if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); } private: @@ -322,7 +373,7 @@ class HsaRsrcFactory { // HSA was initialized const bool initialize_hsa_; - static HsaRsrcFactory* instance_; + static std::atomic instance_; static mutex_t mutex_; // Used to maintain a list of Hsa Gpu Agent Info @@ -336,6 +387,9 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; + // HSA runtime API table + static hsa_pfn_t hsa_api_; + // AqlProfile API table aqlprofile_pfn_t aqlprofile_api_; diff 
--git a/src/util/logger.h b/src/util/logger.h index d37f6567..527589f6 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -76,8 +76,16 @@ class Logger { static Logger* Create() { std::lock_guard lck(mutex_); - if (instance_ == NULL) instance_ = new Logger(); - return instance_; + Logger* obj = instance_.load(std::memory_order_relaxed); + if (obj == NULL) { + obj = new Logger(); + if (obj == NULL) { + std::cerr << "ROCProfiler: log object creation failed" << std::endl << std::flush; + abort(); + } + instance_.store(obj, std::memory_order_release); + } + return obj; } static void Destroy() { @@ -87,8 +95,9 @@ class Logger { } static Logger& Instance() { - Create(); - return *instance_; + Logger* obj = instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(); + return *obj; } private: @@ -179,10 +188,10 @@ class Logger { bool messaging_; bool error_; std::string session_dir_; + std::map message_; static mutex_t mutex_; - static Logger* instance_; - std::map message_; + static std::atomic instance_; }; } // namespace util diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c7d86ccf..7f128e86 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -57,12 +57,12 @@ set ( CTRL_SRC ## Dummy kernel set ( DUMMY_NAME dummy_kernel ) -execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${DUMMY_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) +execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME} ${PROJECT_BINARY_DIR}" ) ## Test kernel set ( TEST_NAME simple_convolution ) set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp ) -execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${TEST_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) +execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${TEST_NAME}/${TEST_NAME} ${PROJECT_BINARY_DIR}" ) ## Building standalone test executable add_executable ( ${STEXE_NAME} ${STTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) diff --git 
a/test/app/intercept_test.cpp b/test/app/intercept_test.cpp index 87e00d64..876b3102 100644 --- a/test/app/intercept_test.cpp +++ b/test/app/intercept_test.cpp @@ -25,9 +25,10 @@ THE SOFTWARE. #include #include +#include #include +#include #include -#include #include "ctrl/run_kernel.h" #include "ctrl/test_aql.h" @@ -36,6 +37,7 @@ THE SOFTWARE. #include "dummy_kernel/dummy_kernel.h" #include "simple_convolution/simple_convolution.h" #include "util/test_assert.h" +#include "util/xml.h" #define PUBLIC_API __attribute__((visibility("default"))) #define CONSTRUCTOR_API __attribute__((constructor)) @@ -45,6 +47,9 @@ THE SOFTWARE. pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; // Tool is unloaded volatile bool is_loaded = false; +// Profiling features +//rocprofiler_feature_t* features = NULL; +//unsigned feature_count = 0; // Error handler void fatal(const std::string msg) { @@ -72,8 +77,19 @@ struct context_entry_t { rocprofiler_callback_data_t data; }; +// Context callback arg +struct callbacks_arg_t { + rocprofiler_pool_t** pools; +}; + +// Handler callback arg +struct handler_arg_t { + rocprofiler_feature_t* features; + unsigned feature_count; +}; + // Dump stored context entry -void dump_context_entry(context_entry_t* entry) { +void dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features, unsigned feature_count) { volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); while (valid->load() == false) sched_yield(); @@ -97,26 +113,44 @@ void dump_context_entry(context_entry_t* entry) { rocprofiler_group_t& group = entry->group; if (group.context == NULL) { - fprintf(stderr, "tool error: context is NULL\n"); - abort(); + fatal("context is NULL\n"); + } + if (feature_count > 0) { + hsa_status_t status = rocprofiler_group_get_data(&group); + check_status(status); + status = rocprofiler_get_metrics(group.context); + check_status(status); } - rocprofiler_close(group.context); + for (unsigned i = 0; i < feature_count; ++i) { + 
const rocprofiler_feature_t* p = &features[i]; + fprintf(stdout, "> %s ", p->name); + switch (p->data.kind) { + // Output metrics results + case ROCPROFILER_DATA_KIND_INT64: + fprintf(stdout, "= (%lu)\n", p->data.result_int64); + break; + default: + fprintf(stderr, "Undefined data kind(%u)\n", p->data.kind); + abort(); + } + } } // Profiling completion handler // Dump and delete the context entry // Return true if the context was dumped successfully -bool context_handler(rocprofiler_group_t group, void* arg) { - context_entry_t* entry = reinterpret_cast(arg); +bool context_handler(const rocprofiler_pool_entry_t* entry, void* arg) { + // Context entry + context_entry_t* ctx_entry = reinterpret_cast(entry->payload); + handler_arg_t* handler_arg = reinterpret_cast(arg); if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } - dump_context_entry(entry); - delete entry; + dump_context_entry(ctx_entry, handler_arg->features, handler_arg->feature_count); if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); @@ -125,35 +159,65 @@ bool context_handler(rocprofiler_group_t group, void* arg) { return false; } +#if 0 +// Profiling completion handler +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler1(rocprofiler_group_t group, void* arg) { + context_entry_t* ctx_entry = reinterpret_cast(arg); + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(ctx_entry, features, feature_count); + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} +#endif // Kernel disoatch callback -hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* /*user_data*/, +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* arg, rocprofiler_group_t* group) { + // Passed tool data + hsa_agent_t agent = callback_data->agent; 
// HSA status hsa_status_t status = HSA_STATUS_ERROR; - // Profiling context - rocprofiler_t* context = NULL; - - // Context entry - context_entry_t* entry = new context_entry_t(); - +#if 1 + // Open profiling context + const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index; + callbacks_arg_t* callbacks_arg = reinterpret_cast(arg); + rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id]; + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + check_status(status); + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast(pool_entry.payload); +#else + // Open profiling context // context properties + context_entry_t* entry = new context_entry_t{}; + rocprofiler_t* context = NULL; rocprofiler_properties_t properties{}; - properties.handler = context_handler; + properties.handler = context_handler1; properties.handler_arg = (void*)entry; - - // Open profiling context - status = rocprofiler_open(callback_data->agent, NULL, 0, + status = rocprofiler_open(agent, features, feature_count, &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); check_status(status); - +#endif // Get group[0] status = rocprofiler_get_group(context, 0, group); check_status(status); // Fill profiling context entry - entry->agent = callback_data->agent; + entry->agent = agent; entry->group = *group; entry->data = *callback_data; entry->data.kernel_name = strdup(callback_data->kernel_name); @@ -162,26 +226,90 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, return HSA_STATUS_SUCCESS; } +unsigned metrics_input(rocprofiler_feature_t** ret) { + // Profiling feature objects + const unsigned feature_count = 9; + rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; + memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); + + // PMC events + features[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; + 
features[0].name = "GRBM_COUNT"; + features[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[1].name = "GRBM_GUI_ACTIVE"; + features[2].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[2].name = "GPUBusy"; + features[3].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[3].name = "SQ_WAVES"; + features[4].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[4].name = "SQ_INSTS_VALU"; + features[5].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[5].name = "VALUInsts"; + features[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[6].name = "TCC_HIT_sum"; + features[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[7].name = "TCC_MISS_sum"; + features[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[8].name = "WRITE_SIZE"; + + *ret = features; + return feature_count; +} + void initialize() { - // Getting GPU device info - const AgentInfo* agent_info = NULL; - if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) { - fprintf(stderr, "GetGpuAgentInfo failed\n"); - abort(); - } + // Available GPU agents + const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents(); + + // Getting profiling features + rocprofiler_feature_t* features = NULL; + unsigned feature_count = metrics_input(&features); + + // Handler arg + handler_arg_t* handler_arg = new handler_arg_t{}; + handler_arg->features = features; + handler_arg->feature_count = feature_count; + + // Context properties + rocprofiler_pool_properties_t properties{}; + properties.num_entries = 100; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_handler; + properties.handler_arg = handler_arg; // Adding dispatch observer + callbacks_arg_t* callbacks_arg = new callbacks_arg_t{}; + callbacks_arg->pools = new rocprofiler_pool_t* [gpu_count]; + for (unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++) { + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == 
false) { + fprintf(stderr, "GetGpuAgentInfo failed\n"); + abort(); + } + + // Open profiling pool + rocprofiler_pool_t* pool = NULL; + hsa_status_t status = rocprofiler_pool_open(agent_info->dev_id, features, feature_count, + &pool, 0/*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + callbacks_arg->pools[gpu_id] = pool; + } + rocprofiler_queue_callbacks_t callbacks_ptrs{}; callbacks_ptrs.dispatch = dispatch_callback; - rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL); + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg); } void cleanup() { // Unregister dispatch callback rocprofiler_remove_queue_callbacks(); - - // Dump stored profiling output data - fflush(stdout); + // Close profiling pool +#if 0 + hsa_status_t status = rocprofiler_pool_flush(pool); + check_status(status); + status = rocprofiler_pool_close(pool); + check_status(status); +#endif } // Tool constructor diff --git a/test/run.sh b/test/run.sh index 580f4713..ed5bbe9a 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,6 +22,23 @@ # THE SOFTWARE. ################################################################################ +# test check routine +test_status=0 +test_number=0 +eval_test() { + label=$1 + cmdline=$2 + echo "$label: \"$cmdline\"" + eval "$cmdline" + if [ $? 
!= 0 ] ; then + echo "$label: FAILED" + test_status=$(($test_status + 1)) + else + echo "$label: PASSED" + fi + test_number=$(($test_number + 1)) +} + # enable tools load failure reporting export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # paths to ROC profiler and oher libraries @@ -37,12 +54,22 @@ export ROCP_METRICS=metrics.xml # test trace export ROC_TEST_TRACE=1 +## Intercepting usage model test + # tool library loaded by ROC profiler export ROCP_TOOL_LIB=./test/libintercept_test.so -../bin/run_tool.sh ./test/ctrl +export ROCP_KITER=50 +export ROCP_DITER=50 +export ROCP_AGENTS=1 +export ROCP_THRS=1 +eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl" + +## Standalone sampling usage model test unset ROCP_TOOL_LIB -eval ./test/standalone_test +eval_test "Standalone sampling usage model test" ./test/standalone_test + +## Libtool test # tool library loaded by ROC profiler export ROCP_TOOL_LIB=libtool.so @@ -61,18 +88,28 @@ export ROCP_DITER=50 export ROCP_AGENTS=1 export ROCP_THRS=1 export ROCP_INPUT=input.xml -eval ./test/ctrl +eval_test "'rocprof' libtool test" ./test/ctrl + +export ROCP_KITER=10 +export ROCP_DITER=10 +export ROCP_AGENTS=1 +export ROCP_THRS=10 +export ROCP_INPUT=input1.xml +eval_test "'rocprof' libtool test n-threads" ./test/ctrl + +## Libtool test, counter sets # Memcopies tracking export ROCP_MCOPY_TRACKING=1 export ROCP_KITER=1 export ROCP_DITER=4 -export ROCP_INPUT=input1.xml -eval ./test/ctrl +export ROCP_INPUT=input2.xml +eval_test "libtool test, counter sets" ./test/ctrl #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. 
-exit 0 +echo "$test_number tests total / $test_status tests failed" +exit $test_status diff --git a/test/tool/input1.xml b/test/tool/input1.xml index 254c83dc..f4ecd178 100644 --- a/test/tool/input1.xml +++ b/test/tool/input1.xml @@ -1,5 +1,14 @@ +# Filter by dispatches range, GPU index and kernel names + + # List of metrics diff --git a/test/tool/input2.xml b/test/tool/input2.xml new file mode 100644 index 00000000..254c83dc --- /dev/null +++ b/test/tool/input2.xml @@ -0,0 +1,5 @@ +# List of metrics + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index d96ab12c..0eee2348 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -309,7 +309,7 @@ struct trace_data_arg_t { hsa_agent_t agent; }; -// Trace data callback for getting trace data from GPU local mamory +// Trace data callback for getting trace data from GPU local memory hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; @@ -616,7 +616,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, kernel_properties_ptr->vgpr_count = kernel_code->reserved_vgpr_count; kernel_properties_ptr->sgpr_count = kernel_code->reserved_sgpr_count; kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; - kernel_properties_ptr->signal = packet->completion_signal; + kernel_properties_ptr->signal = callback_data->completion_signal; // context properties rocprofiler_properties_t properties{}; diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 0293c6c4..35568ba0 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -71,13 +71,13 @@ static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + err = 
HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); if (HSA_AMD_SEGMENT_GLOBAL != segment) { return HSA_STATUS_SUCCESS; } - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; @@ -111,14 +111,16 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize cpu_pool_ = NULL; kern_arg_pool_ = NULL; + InitHsaApiTable(NULL); + // Initialize the Hsa Runtime if (initialize_hsa_) { - status = hsa_init(); + status = hsa_api_.hsa_init(); CHECK_STATUS("Error in hsa_init", status); } // Discover the set of Gpu devices available on the platform - status = hsa_iterate_agents(GetHsaAgentsCallback, this); + status = hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); @@ -128,13 +130,13 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else - status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table loader_api_ = {0}; - status = 
hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer @@ -152,11 +154,93 @@ HsaRsrcFactory::~HsaRsrcFactory() { for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { - hsa_status_t status = hsa_shut_down(); + hsa_status_t status = hsa_api_.hsa_shut_down(); CHECK_STATUS("Error in hsa_shut_down", status); } } +void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { + std::lock_guard lck(mutex_); + + if (hsa_api_.hsa_init == NULL) { + if (table != NULL) { + hsa_api_.hsa_init = table->core_->hsa_init_fn; + hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; + hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; + + hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; + + hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; + hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; + hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; + hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; + hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; + hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; + hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; + hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; + hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn; + + hsa_api_.hsa_system_get_major_extension_table = 
table->core_->hsa_system_get_major_extension_table_fn; + + hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn; + hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; + hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; + hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; + hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn; + hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; + hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn; + + hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn; + + hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn; + hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; + hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; + } else { + hsa_api_.hsa_init = hsa_init; + hsa_api_.hsa_shut_down = hsa_shut_down; + hsa_api_.hsa_agent_get_info = hsa_agent_get_info; + + hsa_api_.hsa_iterate_agents = hsa_iterate_agents; + + hsa_api_.hsa_queue_create = hsa_queue_create; + hsa_api_.hsa_queue_destroy = hsa_queue_destroy; + hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; + hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; + hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + hsa_api_.hsa_signal_create = hsa_signal_create; + hsa_api_.hsa_signal_destroy = hsa_signal_destroy; + hsa_api_.hsa_signal_store_relaxed = 
hsa_signal_store_relaxed; + hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; + hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; + hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; + hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; + + hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; + + hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + + hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file; + hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; + hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; + hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + + hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; + hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time; + hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time; + hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; + hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; + } + } +} + hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { void* handle = dlopen(kAqlProfileLib, RTLD_NOW); if (handle == NULL) { @@ -198,7 +282,7 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { AgentInfo* agent_info = NULL; hsa_device_type_t type; - status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); CHECK_STATUS("Error Calling hsa_agent_get_info", status); if (type == HSA_DEVICE_TYPE_CPU) { @@ -207,9 +291,9 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_type = HSA_DEVICE_TYPE_CPU; 
agent_info->dev_index = cpu_list_.size(); - status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; - status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; @@ -221,28 +305,28 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info = new AgentInfo{}; agent_info->dev_id = agent; agent_info->dev_type = HSA_DEVICE_TYPE_GPU; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); strncpy(agent_info->gfxip, agent_info->name, 4); agent_info->gfxip[4] = '\0'; - hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); - hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); - hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? 
true : false; - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num); - hsa_agent_get_info(agent, + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se); agent_info->cpu_pool = {}; agent_info->kern_arg_pool = {}; - status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); // Set GPU index @@ -333,7 +417,7 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { hsa_status_t status; - status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, queue); return (status == HSA_STATUS_SUCCESS); } @@ -344,7 +428,7 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, // @return bool true if successful, false otherwise bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* 
signal) { hsa_status_t status; - status = hsa_signal_create(value, 0, NULL, signal); + status = hsa_api_.hsa_signal_create(value, 0, NULL, signal); return (status == HSA_STATUS_SUCCESS); } @@ -357,7 +441,7 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } @@ -372,11 +456,11 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size uint8_t* buffer = NULL; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? 
buffer : NULL; @@ -392,11 +476,11 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { - status = hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; @@ -420,7 +504,7 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { while (1) { const hsa_signal_value_t signal_value = - hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); + hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); if (signal_value == 0) { break; } else { @@ -432,7 +516,7 @@ void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { // Wait signal with signal value restore void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { SignalWait(signal); - hsa_signal_store_relaxed(const_cast(signal), signal_value); + hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } // Copy data from GPU to host memory @@ -440,12 +524,12 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; - status = hsa_signal_create(1, 0, NULL, &s); + status = hsa_api_.hsa_signal_create(1, 0, NULL, &s); CHECK_STATUS("hsa_signal_create()", status); - status = 
hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); SignalWait(s); - status = hsa_signal_destroy(s); + status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); @@ -487,29 +571,29 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Create code object reader hsa_code_object_reader_t code_obj_rdr = {0}; - status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); + status = hsa_api_.hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); if (status != HSA_STATUS_SUCCESS) { std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; return false; } // Create executable. - status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + status = hsa_api_.hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable); CHECK_STATUS("Error in creating executable object", status); // Load code object. - status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, + status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, NULL, NULL); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. - status = hsa_executable_freeze(*executable, ""); + status = hsa_api_.hsa_executable_freeze(*executable, ""); CHECK_STATUS("Error in freezing executable object", status); // Get symbol handle. 
hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, + status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); @@ -547,9 +631,9 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue - const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); - hsa_queue_store_write_index_relaxed(queue, write_idx + 1); - while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { + const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue); + hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { sched_yield(); } @@ -566,7 +650,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { header_atomic_ptr->store(slot_data[0], std::memory_order_release); // ringdoor bell - hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); + hsa_api_.hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); return write_idx; } @@ -588,6 +672,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; +std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; +hsa_pfn_t HsaRsrcFactory::hsa_api_{}; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index 738a8e2f..552789cc 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -26,6 +26,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#define TEST_UTIL_HSA_RSRC_FACTORY_H_ #include +#include #include #include #include @@ -35,6 +36,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -67,6 +69,44 @@ static const size_t MEM_PAGE_BYTES = 0x1000; static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; +struct hsa_pfn_t { + decltype(hsa_init)* hsa_init; + decltype(hsa_shut_down)* hsa_shut_down; + decltype(hsa_agent_get_info)* hsa_agent_get_info; + + decltype(hsa_iterate_agents)* hsa_iterate_agents; + + decltype(hsa_queue_create)* hsa_queue_create; + decltype(hsa_queue_destroy)* hsa_queue_destroy; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + decltype(hsa_signal_create)* hsa_signal_create; + decltype(hsa_signal_destroy)* hsa_signal_destroy; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed; + decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire; + + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; + decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; + + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + + decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file; + decltype(hsa_executable_create_alt)* hsa_executable_create_alt; + decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; + decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + + 
decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler; + decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; +}; + // Encapsulates information about a Hsa Agent such as its // handle, name, max queue size, max wavefront size, etc. struct AgentInfo { @@ -133,7 +173,7 @@ class HsaTimer { sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; } - // Methids for system-clock/ns conversion + // Methods for system-clock/ns conversion timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } @@ -162,17 +202,20 @@ class HsaRsrcFactory { static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); - if (instance_ == NULL) { - instance_ = new HsaRsrcFactory(initialize_hsa); + HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed); + if (obj == NULL) { + obj = new HsaRsrcFactory(initialize_hsa); + instance_.store(obj, std::memory_order_release); } - return instance_; + return obj; } static HsaRsrcFactory& Instance() { - if (instance_ == NULL) instance_ = Create(false); - hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + HsaRsrcFactory* obj = instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(false); + hsa_status_t status = (obj != NULL) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); - return *instance_; + return *obj; } static void Destroy() { @@ -274,6 +317,10 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + // Initialize HSA API table + void static InitHsaApiTable(HsaApiTable* table); + static const hsa_pfn_t* HsaApi() { return &hsa_api_; } + // Return AqlProfile API table typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } @@ -291,7 +338,7 @@ class HsaRsrcFactory { static void SetTimeoutNs(const timestamp_t& time) { std::lock_guard lck(mutex_); timeout_ns_ = time; - if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time); + if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); } private: @@ -320,7 +367,7 @@ class HsaRsrcFactory { // HSA was initialized const bool initialize_hsa_; - static HsaRsrcFactory* instance_; + static std::atomic instance_; static mutex_t mutex_; // Used to maintain a list of Hsa Gpu Agent Info @@ -334,6 +381,9 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; + // HSA runtime API table + static hsa_pfn_t hsa_api_; + // AqlProfile API table aqlprofile_pfn_t aqlprofile_api_; From 6bc8b939bf6e1e85841ba3e120ba057a78e886ce Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 4 Apr 2019 21:39:31 -0500 Subject: [PATCH 051/153] Update README.md --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 711bc86b..311ae05e 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,24 @@ Options: > -o - output CSV file [.csv] + The output CSV file columns meaning in the columns order: + Index - kernels dispatch order index + KernelName - the dispatched kernel name 
+ gpu-id - GPU id the kernel was submitted to + queue-id - the ROCm queue unique id the kernel was submitted to + queue-index - The ROCm queue write index for the submitted AQL packet + tid - system application thread id which submitted the kernel + grd - the kernel's grid size + wgr - the kernel's work group size + lds - the kernel's LDS memory size + scr - the kernel's scratch memory size + vgpr - the kernel's VGPR size + sgpr - the kernel's SGPR size + fbar - the kernel's barriers limitation + sig - the kernel's completion signal + The columns with the counters values per kernel dispatch + DispatchNs/BeginNs/EndNs/CompleteNs timestamp columns if time-stamping was enabled + -d - directory where profiler store profiling data including thread treaces [/tmp] The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. -t - to change the temporary directory [/tmp] From 2efc120ec5a785edc903be34d64f6aa12855f456 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 4 Apr 2019 21:40:37 -0500 Subject: [PATCH 052/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 311ae05e..316597ae 100644 --- a/README.md +++ b/README.md @@ -119,8 +119,8 @@ Options: sgpr - the kernel's SGPR size fbar - the kernel's barriers limitation sig - the kernel's completion signal - The columns with the counters values per kernel dispatch - DispatchNs/BeginNs/EndNs/CompleteNs timestamp columns if time-stamping was enabled + ... - The columns with the counters values per kernel dispatch + DispatchNs/BeginNs/EndNs/CompleteNs - timestamp columns if time-stamping was enabled -d - directory where profiler store profiling data including thread treaces [/tmp] The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. 
From 83703a97fa9188ff01d1dbb2a8b134ac3697566f Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 4 Apr 2019 21:47:25 -0500 Subject: [PATCH 053/153] Update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 316597ae..a3ca363b 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,12 @@ Options: --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] --timestamp - to turn on/off the kernel dispatches timestamps, dispatch/begin/end/complete [off] + Four kernel timestamps in nanoseconds are reported: + DispatchNs - the time when the kernel AQL dispatch packet was written to the queue + BeginNs - the kernel execution begin time + EndNs - the kernel execution end time + CompleteNs - the time when the completion signal of the AQL dispatch packet was received + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] From 44198f37db96a11827dbc0397613ccb61efce7cb Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 4 Apr 2019 22:55:18 -0500 Subject: [PATCH 054/153] N-GPU standalone intercept --- src/core/rocprofiler.cpp | 42 ++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index de16fa19..81e146a0 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -233,13 +233,12 @@ hsa_status_t GetExcStatus(const std::exception& e) { : HSA_STATUS_ERROR; } - -inline size_t CreateEnableCmd(const hsa_agent_t& agent, packet_t* command, const size_t& slot_count) { - rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); - const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); +inline size_t CreateEnableCmd(const rocprofiler::util::AgentInfo* agent_info, packet_t* command, const size_t& slot_count) { 
const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); const size_t packet_count = (is_legacy) ? Profile::LEGACY_SLOT_SIZE_PKT : 1; + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + if (packet_count > slot_count) EXC_RAISING(HSA_STATUS_ERROR, "packet_count > slot_count"); // AQLprofile object @@ -291,9 +290,13 @@ hsa_status_t CreateQueuePro( uint32_t group_segment_size, hsa_queue_t **queue) { - static packet_t enable_cmd_packet[Profile::LEGACY_SLOT_SIZE_PKT]; - static size_t enable_cmd_size = 0; - static std::mutex enable_cmd_mutex; + typedef std::pair cmd_entry_t; + typedef std::vector cmd_vec_t; + static cmd_vec_t cmd_vec; + static uint32_t cmd_mask = 0; + static std::mutex cmd_mutex; + + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); // Create HSA queue hsa_status_t status = hsa_queue_create_fn( @@ -308,15 +311,30 @@ hsa_status_t CreateQueuePro( if (status != HSA_STATUS_SUCCESS) return status; // Create 'Enable' cmd packet - if (enable_cmd_size == 0) { - std::lock_guard lck(enable_cmd_mutex); - if (enable_cmd_size == 0) { - enable_cmd_size = CreateEnableCmd(agent, enable_cmd_packet, Profile::LEGACY_SLOT_SIZE_PKT); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + const uint32_t dev_index = 1 << agent_info->dev_index; + const uint32_t dev_mask = 1 << dev_index; + if ((cmd_mask & dev_mask) == 0) { + std::lock_guard lck(cmd_mutex); + + if ((cmd_mask & dev_mask) == 0) { + cmd_mask |= dev_mask; + // Allocating cmd vector + uint32_t mask = 1; + while (1) { + const uint32_t max = 1 << cmd_vec.size(); + if (mask >= max) cmd_vec.push_back({}); + if (((mask & dev_mask) != 0) || (mask == 0)) break; + mask <<= 1; + } + if (mask == 0) EXC_RAISING(status, "bad device index (" << dev_index << ")"); + // Creating cmd packets + cmd_vec[dev_index].second = CreateEnableCmd(agent_info, cmd_vec[dev_index].first, Profile::LEGACY_SLOT_SIZE_PKT); 
} } // Enable counters for the queue - rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, enable_cmd_packet, enable_cmd_size); + rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, cmd_vec[dev_index].first, cmd_vec[dev_index].second); return HSA_STATUS_SUCCESS; } From d36407aa9e31787291875399ba9141cb4a360422 Mon Sep 17 00:00:00 2001 From: Rene van Oostrum Date: Thu, 11 Apr 2019 11:12:36 -0500 Subject: [PATCH 055/153] fix overwriting of LD_LIBRARY_PATH; instead, prepend to old path --- bin/rpl_run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 78edf446..527c99c1 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -43,7 +43,7 @@ HIP_TRACE=0 # Generate stats GEN_STATS=0 -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$ROCTRACER_PATH/lib:$ROCTRACER_PATH/tool:$HSA_PATH +export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$ROCTRACER_PATH/lib:$ROCTRACER_PATH/tool:$HSA_PATH:$LD_LIBRARY_PATH export PATH=.:$PATH # enable error logging From 968f3bc0787e0b50a0eb1f6d7a9d8855eb31d976 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 8 Jul 2019 19:46:42 -0500 Subject: [PATCH 056/153] 2.6 update --- bin/rpl_run.sh | 22 ++++---- bin/tblextr.py | 13 +++-- cmake_modules/env.cmake | 2 + inc/rocprofiler.h | 7 ++- src/core/context.h | 46 ++++++++++------ src/core/profile.h | 18 ++++--- src/core/rocprofiler.cpp | 12 ++--- src/core/types.h | 12 +++++ test/app/standalone_test.cpp | 17 ++++++ test/run.sh | 26 ++++++--- test/tool/tool.cpp | 102 ++++++++++++++++++++++++++--------- 11 files changed, 195 insertions(+), 82 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 78edf446..f29582a8 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -30,12 +30,12 @@ RUN_DIR=`pwd` TMP_DIR="/tmp" DATA_DIR="rpl_data_${time_stamp}_$$" +RPL_PATH=$PKG_DIR/lib +TLIB_PATH=$PKG_DIR/tool + # PATH to custom HSA and OpenCl runtimes HSA_PATH=$PKG_DIR/lib/hsa -# roctracer path -if [ -z "$ROCTRACER_PATH" ] ; then 
ROCTRACER_PATH=$ROOT_DIR/roctracer; fi - # runtime API trace HSA_TRACE=0 HIP_TRACE=0 @@ -43,7 +43,7 @@ HIP_TRACE=0 # Generate stats GEN_STATS=0 -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$ROCTRACER_PATH/lib:$ROCTRACER_PATH/tool:$HSA_PATH +export LD_LIBRARY_PATH=$HSA_PATH:$LD_LIBRARY_PATH export PATH=.:$PATH # enable error logging @@ -54,9 +54,9 @@ unset ROCPROFILER_SESS # ROC Profiler environment # Loading of ROC Profiler by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so +export HSA_TOOLS_LIB=$RPL_PATH/librocprofiler64.so # Loading of the test tool by ROC Profiler -export ROCP_TOOL_LIB=libtool.so +export ROCP_TOOL_LIB=$TLIB_PATH/libtool.so # Enabling HSA dispatches intercepting by ROC PRofiler export ROCP_HSA_INTERCEPT=1 # Disabling internal ROC Profiler proxy queue (simple version supported for testing purposes) @@ -129,7 +129,7 @@ usage() { echo " >" echo "" echo " -o - output CSV file [.csv]" - echo " -d - directory where profiler store profiling data including thread treaces [/tmp]" + echo " -d - directory where profiler store profiling data including traces [/tmp]" echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." echo " -t - to change the temporary directory [/tmp]" echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." 
@@ -201,7 +201,6 @@ run() { fi API_TRACE="" - PRELOAD_LIBS="" if [ "$HSA_TRACE" = 1 ] ; then API_TRACE="hsa" fi @@ -211,14 +210,11 @@ run() { else API_TRACE="all" fi - if [ -z "$HCC_HOME" ] ; then error "env var HCC_HOME is not defined"; fi - PRELOAD_LIBS="$PRELOAD_LIBS $HCC_HOME/lib/libmcwamp_hsa.so" fi if [ -n "$API_TRACE" ] ; then API_TRACE=$(echo $API_TRACE | sed 's/all//') if [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE; fi - export HSA_TOOLS_LIB="libtracer_tool.so libroctracer64.so $HSA_TOOLS_LIB" - PRELOAD_LIBS="$PRELOAD_LIBS $HSA_TOOLS_LIB" + export HSA_TOOLS_LIB="$RPL_PATH/libroctracer64.so $TLIB_PATH/libtracer_tool.so $HSA_TOOLS_LIB" fi redirection_cmd="" @@ -228,7 +224,7 @@ run() { fi #unset ROCP_OUTPUT_DIR - CMD_LINE="LD_PRELOAD='$PRELOAD_LIBS' $APP_CMD $redirection_cmd" + CMD_LINE="$APP_CMD $redirection_cmd" eval "$CMD_LINE" } diff --git a/bin/tblextr.py b/bin/tblextr.py index 329ab0d8..818e85f2 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -56,6 +56,13 @@ def fatal(msg): sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); sys.exit(1) + +dbglog_count = 0 +def dbglog(msg): + global dbglog_count + dbglog_count += 1 + sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); + fatal("error") ############################################################# # parse results method @@ -224,7 +231,7 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_tid_list.append(int(rec_vals[3])) dep_id_list.append(record_id) record_id += 1 - else: fatal("hsa bad record") + else: fatal(api_name + " bad record: '" + record + "'") for (tid, from_ns) in dep_list: db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) @@ -267,7 +274,7 @@ def fill_copy_db(table_name, db, indir): else: fatal("bad async-copy entry") rec_vals.append(m.group(1)) db.insert_entry(table_handle, rec_vals) - else: fatal("async-copy bad record") + else: fatal("async-copy bad record: '" + record + "'") 
dep_dict[COPY_PID]['to'] = dep_to_us_dict ############################################################# @@ -313,7 +320,7 @@ def fill_ops_db(table_name, db, indir): dep_dict[gpu_pid]['to'] = {} dep_dict[gpu_pid]['to'][corr_id] = int(rec_vals[0]) / 1000 dep_dict[gpu_pid]['bsp'] = OPS_PID - else: fatal("async-copy bad record") + else: fatal("hcc ops bad record: '" + record + "'") return filtr ############################################################# diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 6bf6ed45..44fb0cd0 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -56,6 +56,8 @@ set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) set ( CMAKE_SKIP_BUILD_RPATH TRUE ) +add_definitions ( -DNEW_TRACE_API=1 ) + ## CLANG options if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000" ) diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 1e74c464..b59acfdf 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -27,8 +27,7 @@ THE SOFTWARE. // The goal of the implementation is to provide a HW specific low-level // performance analysis interface for profiling of GPU compute applications. // The profiling includes HW performance counters (PMC) with complex -// performance metrics and thread traces (SQTT). The profiling is supported -// by the SQTT, PMC and Callback APIs. +// performance metrics and traces. // // The library can be used by a tool library loaded by HSA runtime or by // higher level HW independent performance analysis API like PAPI. 
@@ -66,8 +65,8 @@ uint32_t rocprofiler_version_minor(); typedef struct { uint32_t intercept_mode; uint32_t memcopy_tracking; - uint32_t sqtt_size; - uint32_t sqtt_local; + uint32_t trace_size; + uint32_t trace_local; uint64_t timeout; uint32_t timestamp_on; } rocprofiler_settings_t; diff --git a/src/core/context.h b/src/core/context.h index a59effd0..856c7024 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -83,7 +83,7 @@ class Group { Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) : pmc_profile_(agent_info), - sqtt_profile_(agent_info), + trace_profile_(agent_info), n_profiles_(0), refs_(1), context_(context), @@ -97,7 +97,7 @@ class Group { pmc_profile_.Insert(info); break; case ROCPROFILER_FEATURE_KIND_TRACE: - sqtt_profile_.Insert(info); + trace_profile_.Insert(info); break; default: EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); @@ -107,21 +107,21 @@ class Group { hsa_status_t Finalize() { hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, read_vector_); if (status == HSA_STATUS_SUCCESS) { - status = sqtt_profile_.Finalize(start_vector_, stop_vector_, read_vector_); + status = trace_profile_.Finalize(start_vector_, stop_vector_, read_vector_); } if (status == HSA_STATUS_SUCCESS) { if (!pmc_profile_.Empty()) ++n_profiles_; - if (!sqtt_profile_.Empty()) ++n_profiles_; + if (!trace_profile_.Empty()) ++n_profiles_; } return status; } void GetProfiles(profile_vector_t& vec) { pmc_profile_.GetProfiles(vec); - sqtt_profile_.GetProfiles(vec); + trace_profile_.GetProfiles(vec); } - void GetTraceProfiles(profile_vector_t& vec) { sqtt_profile_.GetProfiles(vec); } + void GetTraceProfiles(profile_vector_t& vec) { trace_profile_.GetProfiles(vec); } info_vector_t& GetInfoVector() { return info_vector_; } const pkt_vector_t& GetStartVector() const { return start_vector_; } @@ -137,7 +137,7 @@ class Group { private: PmcProfile pmc_profile_; - SqttProfile sqtt_profile_; + 
TraceProfile trace_profile_; info_vector_t info_vector_; pkt_vector_t start_vector_; pkt_vector_t stop_vector_; @@ -361,9 +361,9 @@ class Context { rocprofiler_feature_t* info = &info_array[i]; const rocprofiler_feature_kind_t kind = info->kind; const char* name = info->name; - if (!name) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); - info_map_[name] = info; if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { + if (name == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metric name is NULL"); + info_map_[name] = info; auto ret = metrics_map_.insert({name, NULL}); if (!ret.second) EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name @@ -435,7 +435,19 @@ class Context { set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); } } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features - set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); + if (info->parameters != NULL) { + set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); + } else { + const Metric* metric = metrics_->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); + counters_vec_t counters_vec = metric->GetCounters(); + if (counters_vec.size() != 1) + EXC_RAISING(HSA_STATUS_ERROR, "trace bad metric '" << name << "' is not base counter"); + const counter_t* counter = counters_vec[0]; + const event_t* event = &(counter->event); + set_[0].Insert(profile_info_t{event, NULL, 0, info}); + } } else { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -484,15 +496,15 @@ class Context { if (ainfo_data->sample_id == 0) rinfo->data.result_int64 = 0; rinfo->data.result_int64 += ainfo_data->pmc_data.result; rinfo->data.kind = ROCPROFILER_DATA_KIND_INT64; - } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { + } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { if (rinfo->data.result_bytes.copy) { - const 
bool sqtt_local = SqttProfile::IsLocal(); + const bool trace_local = TraceProfile::IsLocal(); util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); if (sample_id == 0) { const uint32_t output_buffer_size = profile->output_buffer.size; const uint32_t output_buffer_size64 = profile->output_buffer.size / sizeof(uint64_t); const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent); - void* ptr = (sqtt_local) ? hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : + void* ptr = (trace_local) ? hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : calloc(output_buffer_size64, sizeof(uint64_t)); rinfo->data.result_bytes.size = output_buffer_size; rinfo->data.result_bytes.ptr = ptr; @@ -500,19 +512,19 @@ class Context { } char* result_bytes_ptr = reinterpret_cast(rinfo->data.result_bytes.ptr); const char* end = result_bytes_ptr + rinfo->data.result_bytes.size; - const char* src = reinterpret_cast(ainfo_data->sqtt_data.ptr); - uint32_t size = ainfo_data->sqtt_data.size; + const char* src = reinterpret_cast(ainfo_data->trace_data.ptr); + uint32_t size = ainfo_data->trace_data.size; char* ptr = callback_data->ptr; uint32_t* header = reinterpret_cast(ptr); char* dest = ptr + sizeof(*header); if ((dest + size) >= end) { if (dest < end) size = end - dest; - else EXC_RAISING(HSA_STATUS_ERROR, "SQTT data out of output buffer"); + else EXC_RAISING(HSA_STATUS_ERROR, "Trace data out of output buffer"); } bool suc = true; - if (sqtt_local) { + if (trace_local) { suc = hsa_rsrc->Memcpy(profile->agent, dest, src, size); } else { memcpy(dest, src, size); diff --git a/src/core/profile.h b/src/core/profile.h index 6d91192b..223e2e5e 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -236,21 +236,27 @@ class PmcProfile : public Profile { } }; -class SqttProfile : public Profile { +class TraceProfile : public Profile { public: static inline void SetSize(const uint32_t& size) { output_buffer_size_ = size; } static inline uint32_t 
GetSize() { return output_buffer_size_; } static inline void SetLocal(const bool& b) { output_buffer_local_ = b; } static inline bool IsLocal() { return output_buffer_local_; } - SqttProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { - profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_SQTT; + TraceProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE; } void Insert(const profile_info_t& info) { - Profile::Insert(info); - for (unsigned j = 0; j < info.parameter_count; ++j) { - Config(&profile_).Insert(info.parameters[j]); + if (info.parameters != NULL) { + Profile::Insert(info); + for (unsigned j = 0; j < info.parameter_count; ++j) { + Config(&profile_).Insert(info.parameters[j]); + } + } else if (info.event != NULL) { + Config(&profile_).Insert(*(info.event)); + } else { + EXC_ABORT(HSA_STATUS_ERROR, "invalid trace info inserted"); } } diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 81e146a0..090e5492 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -175,16 +175,16 @@ uint32_t LoadTool() { rocprofiler_settings_t settings{}; settings.intercept_mode = (intercept_mode != 0) ? 1 : 0; - settings.sqtt_size = SqttProfile::GetSize(); - settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; + settings.trace_size = TraceProfile::GetSize(); + settings.trace_local = TraceProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; if (handler) handler(); else if (handler_prop) handler_prop(&settings); - SqttProfile::SetSize(settings.sqtt_size); - SqttProfile::SetLocal(settings.sqtt_local != 0); + TraceProfile::SetSize(settings.trace_size); + TraceProfile::SetLocal(settings.trace_local != 0); util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; @@ -384,8 +384,8 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( } rocprofiler_properties_t rocprofiler_properties; -uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M -bool SqttProfile::output_buffer_local_ = true; +uint32_t TraceProfile::output_buffer_size_ = 0x2000000; // 32M +bool TraceProfile::output_buffer_local_ = true; std::atomic Tracker::instance_{}; Tracker::mutex_t Tracker::glob_mutex_; Tracker::counter_t Tracker::counter_ = 0; diff --git a/src/core/types.h b/src/core/types.h index ef8600f0..c72bb343 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -23,6 +23,8 @@ THE SOFTWARE. #ifndef SRC_CORE_TYPES_H_ #define SRC_CORE_TYPES_H_ +#include + #include namespace rocprofiler { @@ -33,6 +35,16 @@ typedef hsa_ven_amd_aqlprofile_profile_t profile_t; typedef hsa_ext_amd_aql_pm4_packet_t packet_t; typedef uint32_t packet_word_t; typedef uint64_t timestamp_t; + +inline std::ostream& operator<< (std::ostream& out, const event_t& event) { + out << "[block_name(" << event.block_name << "). block_index(" << event.block_index << "). counter_id(" << event.counter_id << ")]"; + return out; +} +inline std::ostream& operator<< (std::ostream& out, const parameter_t& parameter) { + out << "[parameter_name(" << parameter.parameter_name << "). 
value(" << parameter.value << ")]"; + return out; +} + } // namespace rocprofiler #endif // SRC_CORE_TYPES_H_ diff --git a/test/app/standalone_test.cpp b/test/app/standalone_test.cpp index f6fc965e..b173c4d3 100644 --- a/test/app/standalone_test.cpp +++ b/test/app/standalone_test.cpp @@ -104,6 +104,23 @@ int main() { feature[7].name = "TCC_MISS_sum"; feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; feature[8].name = "WRITE_SIZE"; +// feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[8].name = "TCC_EA_WRREQ_sum"; +// feature[9].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[9].name = "TCC_EA_WRREQ_64B_sum"; +#if 0 + // Tracing parameters + const unsigned parameter_count = 2; + rocprofiler_parameter_t parameters[parameter_count]; + feature[2].name = "THREAD_TRACE"; + feature[2].kind = ROCPROFILER_FEATURE_KIND_TRACE; + feature[2].parameters = parameters; + feature[2].parameter_count = parameter_count; + parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; + parameters[0].value = 0; + parameters[1].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; + parameters[1].value = 0; +#endif // Instantiate HSA resources HsaRsrcFactory::Create(); diff --git a/test/run.sh b/test/run.sh index ed5bbe9a..d1aa2b88 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,21 +22,31 @@ # THE SOFTWARE. ################################################################################ +test_filter=-1 +if [ -n "$1" ] ; then + test_filter=$1 +fi + # test check routin test_status=0 test_number=0 +xeval_test() { + test_number=$test_number +} eval_test() { label=$1 cmdline=$2 - echo "$label: \"$cmdline\"" - eval "$cmdline" - if [ $? != 0 ] ; then - echo "$label: FAILED" - test_status=$(($test_status + 1)) - else - echo "$label: PASSED" + if [ $test_filter = -1 -o $test_filter = $test_number ] ; then + echo "$label: \"$cmdline\"" + eval "$cmdline" + if [ $? 
!= 0 ] ; then + echo "$label: FAILED" + test_status=$(($test_status + 1)) + else + echo "$label: PASSED" + fi fi - test_number=$(($test_number + 1)) + test_number=$((test_number + 1)) } # enable tools load failure reporting diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 0eee2348..10facbd0 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -134,8 +134,10 @@ static uint32_t CTX_OUTSTANDING_MAX = 0; static uint32_t CTX_OUTSTANDING_MON = 0; // to truncate kernel names uint32_t to_truncate_names = 0; -// local SQTT buffer -bool is_sqtt_local = true; +// local trace buffer +bool is_trace_local = true; +// SPM trace enabled +bool is_spm_trace = false; static inline uint32_t GetPid() { return syscall(__NR_getpid); } static inline uint32_t GetTid() { return syscall(__NR_gettid); } @@ -281,7 +283,7 @@ void dealloc_context_entry(context_entry_t* entry) { // Dump trace data to file void dump_sqtt_trace(const char* label, const uint32_t chunk, const void* data, const uint32_t& size) { if (result_prefix != NULL) { - // Open SQTT file + // Open file std::ostringstream oss; oss << result_prefix << "/thread_trace_" << label << "_se" << chunk << ".out"; FILE* file = fopen(oss.str().c_str(), "w"); @@ -298,11 +300,36 @@ void dump_sqtt_trace(const char* label, const uint32_t chunk, const void* data, fprintf(file, "%04x\n", ptr[i]); } - // Close SQTT file + // Close file fclose(file); } } +// Dump trace data to file +void dump_spm_trace(const char* label, const void* data, const uint32_t& size) { + if (result_prefix != NULL) { + // Open trace file + std::ostringstream oss; + oss << result_prefix << "/spm_trace_" << label << ".out"; + const int fd = open(oss.str().c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0666); + if (fd == -1) { + std::ostringstream errmsg; + errmsg << "open error, file '" << oss.str().c_str() << "'"; + perror(errmsg.str().c_str()); + abort(); + } + // write trace binary data + if (write(fd, data, size) == -1) { + std::ostringstream errmsg; + errmsg << 
"write error, file '" << oss.str().c_str() << "'"; + perror(errmsg.str().c_str()); + abort(); + } + // Close file + close(fd); + } +} + struct trace_data_arg_t { FILE* file; const char* label; @@ -314,23 +341,43 @@ hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; trace_data_arg_t* arg = reinterpret_cast(data); - if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { - const void* data_ptr = info_data->sqtt_data.ptr; - const uint32_t data_size = info_data->sqtt_data.size; - fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); + if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { + if (is_spm_trace) { + if (info_data->sample_id != 0) { + fatal("Only one SPM sample expected"); + } + const void* data_ptr = info_data->trace_data.ptr; + const uint32_t data_size = info_data->trace_data.size; + fprintf(arg->file, " size(%u)\n", data_size); - if (is_sqtt_local) { + if (is_trace_local == false) fatal("SPM trace supports only local trace allocation"); HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); const uint32_t mem_size = data_size; void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { - fatal("SQTT data memcopy to host failed"); + fatal("Trace data memcopy to host failed"); } - dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); + dump_spm_trace(arg->label, buffer, data_size); HsaRsrcFactory::FreeMemory(buffer); } else { - dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); + const void* data_ptr = info_data->trace_data.ptr; + const uint32_t data_size = info_data->trace_data.size; + fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); + + if (is_trace_local) { + HsaRsrcFactory* hsa_rsrc = 
&HsaRsrcFactory::Instance(); + const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); + const uint32_t mem_size = data_size; + void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); + if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { + fatal("Trace data memcopy to host failed"); + } + dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); + HsaRsrcFactory::FreeMemory(buffer); + } else { + dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); + } } } else status = HSA_STATUS_ERROR; @@ -367,12 +414,12 @@ void output_results(const context_entry_t* entry, const char* label) { for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { const uint32_t chunk_size = *reinterpret_cast(ptr); const char* chunk_data = ptr + sizeof(uint32_t); - if (chunk_data >= end) fatal("SQTT data is out of the result buffer size"); + if (chunk_data >= end) fatal("Trace data is out of the result buffer size"); dump_sqtt_trace(label, i, chunk_data, chunk_size); const uint32_t off = align_size(chunk_size, sizeof(uint32_t)); ptr = chunk_data + off; - if (chunk_data >= end) fatal("SQTT data ptr is out of the result buffer size"); + if (chunk_data >= end) fatal("Trace data ptr is out of the result buffer size"); size += chunk_size; } fprintf(file, "size(%lu)\n", size); @@ -388,6 +435,7 @@ void output_results(const context_entry_t* entry, const char* label) { break; } default: + if (is_spm_trace) continue; fprintf(stderr, "RPL-tool: undefined data kind(%u)\n", p->data.kind); abort(); } @@ -821,19 +869,19 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) if (it != opts.end()) { CTX_OUTSTANDING_MAX = atol(it->second.c_str()); } it = opts.find("heartbeat"); if (it != opts.end()) { CTX_OUTSTANDING_MON = atol(it->second.c_str()); } - it = opts.find("sqtt-size"); + it = opts.find("trace-size"); if (it != opts.end()) { - std::string str = normalize_token(it->second, true, "option sqtt-size"); + 
std::string str = normalize_token(it->second, true, "option trace-size"); uint32_t multiplier = 1; switch (str.back()) { case 'K': multiplier = 1024; break; case 'M': multiplier = 1024 * 1024; break; } if (multiplier != 1) str = str.substr(0, str.length() - 1); - settings->sqtt_size = strtoull(str.c_str(), NULL, 0) * multiplier; + settings->trace_size = strtoull(str.c_str(), NULL, 0) * multiplier; } - it = opts.find("sqtt-local"); - if (it != opts.end()) { settings->sqtt_local = (it->second == "on"); } + it = opts.find("trace-local"); + if (it != opts.end()) { settings->trace_local = (it->second == "on"); } it = opts.find("memcopies"); if (it != opts.end()) { settings->memcopy_tracking = (it->second == "on"); } } @@ -850,14 +898,14 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) check_env_var("ROCP_TIMESTAMP_ON", settings->timestamp_on); // Set data timeout check_env_var("ROCP_DATA_TIMEOUT", settings->timeout); - // Set SQTT size - check_env_var("ROCP_SQTT_SIZE", settings->sqtt_size); - // Set SQTT local buffer - check_env_var("ROCP_SQTT_LOCAL", settings->sqtt_local); + // Set trace size + check_env_var("ROCP_TRACE_SIZE", settings->trace_size); + // Set trace local buffer + check_env_var("ROCP_TRACE_LOCAL", settings->trace_local); // Set memcopies tracking check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking); - is_sqtt_local = settings->sqtt_local; + is_trace_local = settings->trace_local; // Printing out info char* info_symb = getenv("ROCP_INFO"); @@ -941,7 +989,11 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) range_vec->push_back(*(range_vec->begin()) + 1); } - const unsigned feature_count = metrics_vec.size(); + // Getting traces + const auto traces_list = xml->GetNodes("top.trace"); + if (traces_list.size() > 1) fatal("ROCProfiler: only one trace supported at a time"); + + const unsigned feature_count = metrics_vec.size() + traces_list.size(); rocprofiler_feature_t* features = new 
rocprofiler_feature_t[feature_count]; memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); From 12b64e432ee352f9f772925319bdf59a122d12c9 Mon Sep 17 00:00:00 2001 From: Nicholas Curtis Date: Wed, 10 Jul 2019 14:46:10 -0500 Subject: [PATCH 057/153] add flag to rpl_run.sh that enables the user to define custom metrics files --- bin/rpl_run.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index f29582a8..1c07997f 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -133,6 +133,7 @@ usage() { echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." echo " -t - to change the temporary directory [/tmp]" echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." + echo " -m - file defining custom metrics to use in-place of defaults." echo "" echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" @@ -258,6 +259,9 @@ while [ 1 ] ; do if [ "$OUTPUT_DIR" = "-" ] ; then DATA_PATH=$TMP_DIR fi + elif [ "$1" = "-m" ] ; then + unset ROCP_METRICS + export ROCP_METRICS="$2" elif [ "$1" = "--list-basic" ] ; then export ROCP_INFO=b eval "$PKG_DIR/tool/ctrl" From b71ad35d94d0817120c8f909755bdf7b1a4ba7f1 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 29 Jul 2019 23:41:33 -0500 Subject: [PATCH 058/153] Protect application arguments with spaces. 
Issue: `exe -g "2 2 1"` was interpreted as `exe -g 2 2 1` Now `"exe" "-g" "2 2 1"` --- bin/rpl_run.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 1c07997f..20880fcd 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -347,7 +347,12 @@ else csv_output=$RUN_DIR/${input_base}.csv fi -APP_CMD=$* +APP_CMD="" +for i in `seq 1 $#` +do + eval "arg=\${$i}" + APP_CMD=$APP_CMD" "\"$arg\" +done echo "RPL: profiling '$APP_CMD'" echo "RPL: input file '$INPUT_FILE'" From 21b6013aba9dc611c97ac9a92602b80011162660 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 11 Sep 2019 10:19:08 -0500 Subject: [PATCH 059/153] minor cosmetic fixes --- bin/rpl_run.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 20880fcd..d6398585 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
@@ -348,10 +348,12 @@ else fi APP_CMD="" -for i in `seq 1 $#` -do +for i in `seq 1 $#`; do + if [ -n "$APP_CMD" ] ; then + APP_CMD=$APP_CMD" " + fi eval "arg=\${$i}" - APP_CMD=$APP_CMD" "\"$arg\" + APP_CMD=$APP_CMD\"$arg\" done echo "RPL: profiling '$APP_CMD'" From f77f71e7880f7179757193d6ec7158e0367b5b93 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 26 Sep 2019 23:05:35 -0500 Subject: [PATCH 060/153] rocm2.7 update --- README.md | 3 +- bin/dform.py | 2 +- bin/rpl_run.sh | 45 ++++++++++++++++++++++++--- bin/sqlitedb.py | 13 ++++---- bin/tblextr.py | 38 ++++++++++++++--------- src/core/metrics.h | 6 +++- src/core/profile.h | 7 +++-- src/xml/xml.h | 76 +++++++++++++++++++++++++++++++++++++--------- test/run.sh | 1 + 9 files changed, 144 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index a3ca363b..82f525dc 100644 --- a/README.md +++ b/README.md @@ -37,10 +37,11 @@ profiling includes HW performance counters with complex performance metrics. ## To build with the current installed ROCM: ``` - To build and install to /opt/rocm/rocprofiler + export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm + cd .../rocprofiler mkdir build cd build - export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm cmake .. 
make make install diff --git a/bin/dform.py b/bin/dform.py index 5fc8d6fc..1e5c63b1 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -25,7 +25,7 @@ def gen_api_json_trace(db, table, start_us, outfile): db.execute('DROP VIEW B') def gen_ops_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", Name as name, ("gpu-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index d6398585..eeb62d20 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -26,24 +26,32 @@ time_stamp=`date +%y%m%d_%H%M%S` BIN_DIR=$(dirname $(realpath $0)) PKG_DIR=$(dirname $BIN_DIR) ROOT_DIR=$(dirname $PKG_DIR) +TT_DIR=$ROOT_DIR/roctracer RUN_DIR=`pwd` TMP_DIR="/tmp" DATA_DIR="rpl_data_${time_stamp}_$$" RPL_PATH=$PKG_DIR/lib TLIB_PATH=$PKG_DIR/tool +TTLIB_PATH=$TT_DIR/tool -# PATH to custom HSA and OpenCl runtimes -HSA_PATH=$PKG_DIR/lib/hsa +# Default HIP path +if [ -z "$HIP_PATH" ] ; then + export HIP_PATH=/opt/rocm/hip +fi +# Default HCC path +if [ -z "$HCC_HOME" ] ; then + export HCC_HOME=/opt/rocm/hcc +fi # runtime API trace HSA_TRACE=0 +SYS_TRACE=0 HIP_TRACE=0 # Generate stats GEN_STATS=0 -export LD_LIBRARY_PATH=$HSA_PATH:$LD_LIBRARY_PATH export PATH=.:$PATH # enable error logging @@ -143,6 +151,7 @@ usage() { echo "" echo " --stats - generating kernel execution stats, file .stats.csv" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" + echo " --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible" echo " --hip-trace - to trace HIP, generates API execution stats 
and JSON file chrome-tracing compatible" echo " Generated files: .hsa_stats.txt .json" echo " Traced API list can be set by input .txt or .xml files." @@ -199,12 +208,17 @@ run() { fi fi mkdir -p "$ROCP_OUTPUT_DIR" + + OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" fi API_TRACE="" if [ "$HSA_TRACE" = 1 ] ; then API_TRACE="hsa" fi + if [ "$SYS_TRACE" = 1 ] ; then + API_TRACE="sys" + fi if [ "$HIP_TRACE" = 1 ] ; then if [ -z "$API_TRACE" ] ; then API_TRACE="hip"; @@ -215,12 +229,14 @@ run() { if [ -n "$API_TRACE" ] ; then API_TRACE=$(echo $API_TRACE | sed 's/all//') if [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE; fi - export HSA_TOOLS_LIB="$RPL_PATH/libroctracer64.so $TLIB_PATH/libtracer_tool.so $HSA_TOOLS_LIB" + if [ "$API_TRACE" = "hip" -o "$API_TRACE" = "sys" ] ; then + OUTPUT_LIST="$ROCP_OUTPUT_DIR/" + fi + export HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" fi redirection_cmd="" if [ -n "$ROCP_OUTPUT_DIR" ] ; then - OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" redirection_cmd="2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" fi @@ -229,6 +245,19 @@ run() { eval "$CMD_LINE" } +merge_output() { + output_dir=$(echo "$1" | sed "s/\/[^\/]*$//") + for file_name in `ls $output_dir` ; do + output_name=$(echo $file_name | sed -n "/\.txt$/ s/^[0-9]*_//p") + if [ -n "$output_name" ] ; then + trace_file=$output_dir/$file_name + output_file=$output_dir/$output_name + touch $output_file + cat $trace_file >> $output_file + fi + done +} + # main echo "RPL: on '$time_stamp' from '$PKG_DIR' in '$RUN_DIR'" # Parsing arguments @@ -301,6 +330,11 @@ while [ 1 ] ; do export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 HSA_TRACE=1 + elif [ "$1" = "--sys-trace" ] ; then + ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 + GEN_STATS=1 + SYS_TRACE=1 elif [ "$1" = "--hip-trace" ] ; then ARG_VAL=0 export ROCP_TIMESTAMP_ON=1 @@ -398,6 +432,7 @@ done if [ -n "$csv_output" ] ; then if [ "$GEN_STATS" = "1" ] ; then db_output=$(echo $csv_output | sed "s/\.csv/.db/") + merge_output 
$OUTPUT_LIST python $BIN_DIR/tblextr.py $db_output $OUTPUT_LIST else python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 295fe7a7..e02d4136 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -121,12 +121,13 @@ def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dic for ind in range(len(from_tid)): if (len(corr_id_list) != 0): corr_id = corr_id_list[ind] else: corr_id = ind - from_ts = from_us_list[ind] - start_us - to_ts = to_us_dict[corr_id] - start_us - if from_ts > to_ts: from_ts = to_ts - fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%s,"tid":%s,"name":"dep"}\n' % (from_ts, dep_id, str(from_pid), from_tid[ind])) - fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%s,"tid":0,"name":"dep"}\n' % (to_ts, dep_id, str(to_pid))) - dep_id += 1 + if corr_id in to_us_dict: + from_ts = from_us_list[ind] - start_us + to_ts = to_us_dict[corr_id] - start_us + if from_ts > to_ts: from_ts = to_ts + fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%s,"tid":%s,"name":"dep"}\n' % (from_ts, dep_id, str(from_pid), from_tid[ind])) + fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%s,"tid":0,"name":"dep"}\n' % (to_ts, dep_id, str(to_pid))) + dep_id += 1 def dump_json(self, table_name, data_name, file_name): if not re.search(r'\.json$', file_name): diff --git a/bin/tblextr.py b/bin/tblextr.py index 818e85f2..057e984a 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -68,7 +68,7 @@ def dbglog(msg): # parse results method def parse_res(infile): global max_gpu_id - if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found") + if not os.path.isfile(infile): return # fatal("Error: input file '" + infile + "' not found") inp = open(infile, 'r') beg_pattern = re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") @@ -256,6 +256,8 @@ def fill_copy_db(table_name, db, indir): ptrn_val = re.compile(r'(\d+):(\d+) (.*)$') ptrn_id = 
re.compile(r'^async-copy(\d+)$') + if not os.path.isfile(file_name): return 0 + if not COPY_PID in dep_dict: dep_dict[COPY_PID] = {} dep_to_us_dict = {} @@ -325,11 +327,12 @@ def fill_ops_db(table_name, db, indir): return filtr ############################################################# # main -if (len(sys.argv) < 3): fatal("Usage: " + sys.argv[0] + " ") +if (len(sys.argv) < 2): fatal("Usage: " + sys.argv[0] + " ") outfile = sys.argv[1] infiles = sys.argv[2:] indir = re.sub(r'\/[^\/]*$', r'', infiles[0]) +inext = re.sub(r'^[^\.]*', r'', infiles[0]) dbfile = '' csvfile = '' @@ -342,9 +345,9 @@ def fill_ops_db(table_name, db, indir): else: fatal("Bad output file '" + outfile + "'") -for f in infiles: parse_res(f) -if len(var_table) == 0: sys.exit(1) -merge_table() +if inext == '.txt': + for f in infiles: parse_res(f) + if len(var_table) != 0: merge_table() if dbfile == '': dump_csv(csvfile) @@ -356,8 +359,7 @@ def fill_ops_db(table_name, db, indir): db = SQLiteDB(dbfile) hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) - if hsa_trace_found: - fill_copy_db('COPY', db, indir) + hsa_activity_found = fill_copy_db('COPY', db, indir) ops_filtr = fill_ops_db('OPS', db, indir) hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) @@ -370,6 +372,7 @@ def fill_ops_db(table_name, db, indir): if hsa_trace_found: db.label_json(HSA_PID, "CPU HSA API", jsonfile) + if hsa_activity_found: db.label_json(COPY_PID, "COPY", jsonfile) if hip_trace_found: @@ -379,17 +382,19 @@ def fill_ops_db(table_name, db, indir): for ind in range(0, int(max_gpu_id) + 1): db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) - dform.post_process_data(db, 'A', csvfile) - dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') - if hsa_trace_found and 'BeginNs' in var_list: - dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) + if len(var_table) != 0: + 
dform.post_process_data(db, 'A', csvfile) + dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') + if hsa_trace_found and 'BeginNs' in var_list: + dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) if hsa_trace_found: statfile = re.sub(r'stats', r'hsa_stats', statfile) dform.post_process_data(db, 'HSA') dform.gen_table_bins(db, 'HSA', statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) - + + if hsa_activity_found: dform.post_process_data(db, 'COPY') dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) @@ -398,7 +403,7 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'HIP') dform.gen_table_bins(db, 'HIP', statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) - + dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) @@ -414,12 +419,15 @@ def fill_ops_db(table_name, db, indir): dep_id = 0 for (to_pid, dep_str) in dep_dict.items(): if 'inv' in dep_str: continue + if not 'to' in dep_str: continue + + to_us_dict = dep_str['to'] + from_us_list = dep_str['from'] from_pid = dep_str['pid'] tid_list = dep_str['tid'] - from_us_list = dep_str['from'] - to_us_dict = dep_str['to'] corr_id_list = [] if 'id' in dep_str: corr_id_list = dep_str['id'] + db.flow_json(dep_id, from_pid, tid_list, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) dep_id += len(tid_list) diff --git a/src/core/metrics.h b/src/core/metrics.h index cb55d189..f1532dcf 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -184,7 +184,11 @@ class MetricsDict { xml_->AddConst("top.const.metric", "SIMD_NUM", agent_info->simds_per_cu * agent_info->cu_num); xml_->AddConst("top.const.metric", "SE_NUM", agent_info->se_num); ImportMetrics(agent_info, "const"); - ImportMetrics(agent_info, agent_info->gfxip); + if (std::string("gfx906") == agent_info->name) { + ImportMetrics(agent_info, agent_info->name); + } else { + 
ImportMetrics(agent_info, agent_info->gfxip); + } ImportMetrics(agent_info, "global"); } } diff --git a/src/core/profile.h b/src/core/profile.h index 223e2e5e..9ed03375 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -140,13 +140,14 @@ class Profile { if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start"); status = api->hsa_ven_amd_aqlprofile_stop(&profile_, &stop); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_stop"); + hsa_status_t rd_status = HSA_STATUS_ERROR; #ifdef AQLPROF_NEW_API - hsa_status_t rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); + if (profile_.type == HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC) { + rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); + } #if 0 // Read API returns error if disabled if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); #endif -#else - hsa_status_t rd_status = HSA_STATUS_ERROR; #endif // Set completion signal diff --git a/src/xml/xml.h b/src/xml/xml.h index 933cd2b6..31ed100b 100644 --- a/src/xml/xml.h +++ b/src/xml/xml.h @@ -49,6 +49,7 @@ class Xml { std::string tag; nodes_t nodes; opts_t opts; + const level_t* copy; }; typedef std::vector nodes_vec_t; typedef std::map map_t; @@ -239,7 +240,7 @@ class Xml { if (error) { fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf); - exit(1); + abort(); } lseek(fd_, 0, SEEK_SET); @@ -252,9 +253,9 @@ class Xml { token_t token = (remainder.size()) ? 
remainder : NextToken(); remainder.clear(); - // token_t token1 = token; - // token1.push_back('\0'); - // std::cout << "> " << &token1[0] << std::endl; + // token_t token1 = token; + // token1.push_back('\0'); + // std::cout << ">>> " << &token1[0] << std::endl; // End of file if (token.size() == 0) break; @@ -312,14 +313,18 @@ class Xml { if (token[j] == '=') break; if (j == token.size()) BadFormat(token); token[j] = '\0'; - const char* key = &token[0]; - const char* value = &token[j + 1]; - AddOption(key, value); + const std::string key = &token[0]; + const std::string value = &token[j + 1]; + if (key == "base") { + Inherit(value); + } else { + AddOption(key, value); + } } break; default: std::cout << "XML parser error: wrong state: " << state_ << std::endl; - exit(1); + abort(); } } } @@ -406,11 +411,11 @@ class Xml { token.push_back('\0'); std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" << &token[0] << "'" << std::endl; - exit(1); + abort(); } void AddLevel(const std::string& tag) { - level_t* level = new level_t; + level_t* level = new level_t{}; level->tag = tag; if (level_) { level_->nodes.push_back(level); @@ -418,11 +423,7 @@ class Xml { } level_ = level; - std::string global_tag; - for (level_t* level : stack_) { - global_tag += level->tag + "."; - } - global_tag += tag; + std::string global_tag = GlobalTag(tag); (*map_)[global_tag].push_back(level_); } @@ -431,8 +432,53 @@ class Xml { stack_.pop_back(); } + void Copy(const level_t* from, level_t* to) { + level_t* level = to; + if (level == NULL) { + AddLevel(from->tag); + level = level_; + level->copy = from; + } + level->opts = from->opts; + + for (auto node : from->nodes) { + bool found = false; + const std::string global_tag = GlobalTag(level->tag) + "." 
+ node->tag; + for (auto item : (*map_)[global_tag]) { + if (node == item->copy) { + found = true; + break; + } + } + if (found == false) Copy(node, NULL); + } + + if (to == NULL) UpLevel(); + } + + void Inherit(const std::string& tag) { + std::string global_tag = GlobalTag(tag); + auto it = map_->find(global_tag); + if (it == map_->end()) { + fprintf(stderr, "Node \"%s\": Base not found \"%s\"\n", level_->tag.c_str(), tag.c_str()); + abort(); + } + for (auto node : it->second) { + Copy(node, level_); + } + } + std::string CurrentLevel() const { return level_->tag; } + std::string GlobalTag(const std::string& tag) const { + std::string global_tag; + for (level_t* level : stack_) { + global_tag += level->tag + "."; + } + global_tag += tag; + return global_tag; + } + void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; } const std::string file_name_; diff --git a/test/run.sh b/test/run.sh index d1aa2b88..be59b128 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,6 +22,7 @@ # THE SOFTWARE. 
################################################################################ +# test filter input test_filter=-1 if [ -n "$1" ] ; then test_filter=$1 From c30ee0e83d177f178c1b6d5fe58121e878d0514c Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 26 Sep 2019 23:35:54 -0500 Subject: [PATCH 061/153] 2.8 update --- bin/rpl_run.sh | 5 +++- src/core/metrics.h | 58 ++++++++++++++++++++++++++++-------- src/core/tracker.h | 2 +- src/util/hsa_rsrc_factory.h | 2 +- src/util/logger.h | 2 +- src/xml/xml.h | 57 ++++++++++++++++++++--------------- test/app/test.cpp | 2 +- test/ctrl/test_hsa.cpp | 2 +- test/run.sh | 4 ++- test/tool/gfx_metrics.xml | 34 +++++++++++++++++++++ test/tool/metrics.xml | 55 ++++++++++++++++++++++++++++++---- test/tool/tool.cpp | 1 - test/util/hsa_rsrc_factory.h | 2 +- 13 files changed, 174 insertions(+), 52 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index eeb62d20..2e0ba8ba 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -81,6 +81,7 @@ fatal() { echo "$0: Error: $1" echo "" usage + exit 1 } error() { @@ -420,11 +421,13 @@ if [ -n "$csv_output" ] ; then rm -f $csv_output fi +RET=0 for name in $input_list; do run $name $OUTPUT_DIR $APP_CMD if [ -n "$ROCPROFILER_SESS" -a -e "$ROCPROFILER_SESS/error" ] ; then echo "Error found, profiling aborted." csv_output="" + RET=1 break fi done @@ -450,4 +453,4 @@ if [ "$DATA_PATH" = "$TMP_DIR" ] ; then fi fi -exit 0 +exit $RET diff --git a/src/core/metrics.h b/src/core/metrics.h index f1532dcf..547156de 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -30,12 +30,14 @@ THE SOFTWARE. 
#include #include #include +#include #include #include #include "core/types.h" #include "util/exception.h" #include "util/hsa_rsrc_factory.h" +#include "util/logger.h" #include "xml/expr.h" #include "xml/xml.h" @@ -186,6 +188,8 @@ class MetricsDict { ImportMetrics(agent_info, "const"); if (std::string("gfx906") == agent_info->name) { ImportMetrics(agent_info, agent_info->name); + } else if (std::string("gfx908") == agent_info->name) { + ImportMetrics(agent_info, agent_info->name); } else { ImportMetrics(agent_info, agent_info->gfxip); } @@ -210,9 +214,16 @@ class MetricsDict { } void ImportMetrics(const util::AgentInfo* agent_info, const std::string& scope) { - auto metrics_list = xml_->GetNodes("top." + scope + ".metric"); + auto arr = xml_->GetNodes("top." + scope + ".metric"); + xml::Xml::node_list_t metrics_list(arr.begin(), arr.end()); + uint32_t metrics_number = metrics_list.size(); + bool do_lookup = true; if (!metrics_list.empty()) { - for (auto node : metrics_list) { + uint32_t it_number = metrics_number; + auto it = metrics_list.begin(); + auto end = metrics_list.end(); + while (it != end) { + auto node = *it; const std::string name = node->opts["name"]; const std::string expr_str = node->opts["expr"]; std::string descr = node->opts["descr"]; @@ -242,20 +253,41 @@ class MetricsDict { AddMetric(name, alias, counter); } } else { - xml::Expr* expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); + xml::Expr* expr_obj = NULL; + try { + expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); + } catch(const xml::exception_t& exc) { + if (do_lookup) metrics_list.push_back(node); + else throw(exc); + } + if (expr_obj) { #if 0 - std::cout << "# " << descr << std::endl; - std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; + std::cout << "# " << descr << std::endl; + std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; #endif - counters_vec_t counters_vec; - for (const std::string var : expr_obj->GetVars()) { - auto it = 
cache_.find(var); - if (it == cache_.end()) - EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var - << "' is not found"); - it->second->GetCounters(counters_vec); + counters_vec_t counters_vec; + for (const std::string var : expr_obj->GetVars()) { + auto it = cache_.find(var); + if (it == cache_.end()) { + EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var << "' is not found"); + } + it->second->GetCounters(counters_vec); + } + AddMetric(name, counters_vec, expr_obj); + } + } + + auto cur = it++; + metrics_list.erase(cur); + if (--it_number == 0) { + it_number = metrics_list.size(); + if (it_number < metrics_number) { + metrics_number = it_number; + } else if (it_number == metrics_number) { + do_lookup = false; + } else { + EXC_RAISING(HSA_STATUS_ERROR, "Internal error"); } - AddMetric(name, counters_vec, expr_obj); } } } diff --git a/src/core/tracker.h b/src/core/tracker.h index ffc06b85..c4d619c9 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -83,7 +83,7 @@ class Tracker { static void Destroy() { std::lock_guard lck(glob_mutex_); - if (instance_ != NULL) delete instance_; + if (instance_ != NULL) delete instance_.load(); instance_ = NULL; } diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index f982ddde..af031895 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -226,7 +226,7 @@ class HsaRsrcFactory { static void Destroy() { std::lock_guard lck(mutex_); - if (instance_) delete instance_; + if (instance_) delete instance_.load(); instance_ = NULL; } diff --git a/src/util/logger.h b/src/util/logger.h index 527589f6..8c9cbfd3 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -90,7 +90,7 @@ class Logger { static void Destroy() { std::lock_guard lck(mutex_); - if (instance_ != NULL) delete instance_; + if (instance_ != NULL) delete instance_.load(); instance_ = NULL; } diff --git a/src/xml/xml.h b/src/xml/xml.h index 31ed100b..608f3c54 100644 --- 
a/src/xml/xml.h +++ b/src/xml/xml.h @@ -43,7 +43,10 @@ class Xml { typedef std::vector token_t; struct level_t; - typedef std::vector nodes_t; + typedef std::vector node_vect_t; + typedef std::list node_list_t; + + typedef node_vect_t nodes_t; typedef std::map opts_t; struct level_t { std::string tag; @@ -143,6 +146,7 @@ class Xml { struct print_func { bool fun(const std::string& global_tag, level_t* node) { + std::cout << global_tag << ":" << std::endl; for (auto& opt : node->opts) { std::cout << global_tag << "." << opt.first << " = " << opt.second << std::endl; } @@ -216,14 +220,11 @@ class Xml { if (strncmp(buf, "#include \"", 10) == 0) { for (ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) {} - if (ind == size) { - fprintf(stderr, "XML PreProcess failed, line size limit %zu\n", kBufSize); - error = true; - break; + if (ind < size) { + buf[ind] = '\0'; + size = ind; + lseek(fd_, pos + ind + 1, SEEK_SET); } - buf[ind] = '\0'; - size = ind; - lseek(fd_, pos + ind + 1, SEEK_SET); for (ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) {} if (ind == size) { @@ -291,6 +292,8 @@ class Xml { if (node_begin) { AddLevel(tag); } else { + Inherit(GetOption("base")); + if (strncmp(CurrentLevel().c_str(), tag, strlen(tag)) != 0) { token.back() = '>'; BadFormat(token); @@ -315,11 +318,7 @@ class Xml { token[j] = '\0'; const std::string key = &token[0]; const std::string value = &token[j + 1]; - if (key == "base") { - Inherit(value); - } else { - AddOption(key, value); - } + AddOption(key, value); } break; default: @@ -437,15 +436,16 @@ class Xml { if (level == NULL) { AddLevel(from->tag); level = level_; - level->copy = from; } + level->copy = from; level->opts = from->opts; for (auto node : from->nodes) { bool found = false; + const std::string name = GetOption("name", node); const std::string global_tag = GlobalTag(level->tag) + "." 
+ node->tag; for (auto item : (*map_)[global_tag]) { - if (node == item->copy) { + if ((name == GetOption("name", item)) || (node == item->copy)) { found = true; break; } @@ -457,14 +457,16 @@ class Xml { } void Inherit(const std::string& tag) { - std::string global_tag = GlobalTag(tag); - auto it = map_->find(global_tag); - if (it == map_->end()) { - fprintf(stderr, "Node \"%s\": Base not found \"%s\"\n", level_->tag.c_str(), tag.c_str()); - abort(); - } - for (auto node : it->second) { - Copy(node, level_); + if (!tag.empty()) { + const std::string global_tag = GlobalTag(tag); + auto it = map_->find(global_tag); + if (it == map_->end()) { + fprintf(stderr, "Node \"%s\": Base not found \"%s\"\n", level_->tag.c_str(), tag.c_str()); + abort(); + } + for (auto node : it->second) { + Copy(node, level_); + } } } @@ -479,7 +481,14 @@ class Xml { return global_tag; } - void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; } + void AddOption(const std::string& key, const std::string& value) { + level_->opts[key] = value; + } + std::string GetOption(const std::string& key, const level_t* level = NULL) { + level = (level != NULL) ? level : level_; + auto it = level->opts.find(key); + return (it != level->opts.end()) ? 
it->second : ""; + } const std::string file_name_; unsigned file_line_; diff --git a/test/app/test.cpp b/test/app/test.cpp index 796ba1eb..54067973 100644 --- a/test/app/test.cpp +++ b/test/app/test.cpp @@ -73,7 +73,7 @@ int main(int argc, char** argv) { TestHsa::HsaInstantiate(); - std::thread t[thrs]; + std::vector t(thrs); for (int n = 0; n < thrs; ++n) { t[n] = std::thread(thread_fun, kiter, diter, agents_number); } diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp index d006d19c..3cb5dee7 100644 --- a/test/ctrl/test_hsa.cpp +++ b/test/ctrl/test_hsa.cpp @@ -62,7 +62,7 @@ bool TestHsa::Initialize(int /*arg_cnt*/, char** /*arg_list*/) { if (!hsa_rsrc_->GetGpuAgentInfo(agent_id, &agent_info_)) { agent_info_ = NULL; std::cerr << "> error: agent[" << agent_id << "] is not found" << std::endl; - return NULL; + return false; } } std::clog << "> Using agent[" << agent_info_->dev_index << "] : " << agent_info_->name << std::endl; diff --git a/test/run.sh b/test/run.sh index be59b128..4ba2110a 100755 --- a/test/run.sh +++ b/test/run.sh @@ -30,6 +30,7 @@ fi # test check routin test_status=0 +test_runnum=0 test_number=0 xeval_test() { test_number=$test_number @@ -39,6 +40,7 @@ eval_test() { cmdline=$2 if [ $test_filter = -1 -o $test_filter = $test_number ] ; then echo "$label: \"$cmdline\"" + test_runnum=$((test_runnum + 1)) eval "$cmdline" if [ $? != 0 ] ; then echo "$label: FAILED" @@ -122,5 +124,5 @@ eval_test "libtool test, counter sets" ./test/ctrl #valgrind --tool=massif $tbin #ms_print massif.out. 
-echo "$test_number tests total / $test_status tests failed" +echo "$test_number tests total / $test_runnum tests run / $test_status tests failed" exit $test_status diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index fecfe7b9..698826c6 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -67,3 +67,37 @@ + + + # EA1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index 0b53b72e..c340a439 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -1,12 +1,12 @@ #include "gfx_metrics.xml" - + - + @@ -15,15 +15,16 @@ - + + - + - + @@ -34,7 +35,42 @@ - + + + + + # EA1 + + + + + + + + + + # both EA0 and EA1 should be included + + + + + + + + + + + + + + + + + +# VG20 + +# MI100 + # GPUBusy The percentage of time GPU was busy. @@ -149,6 +185,13 @@ expr=WRITE_SIZE > + # MemWrites32B The total number of effective 32B write transactions to the memory + + # L2CacheHit The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). 
lck(mutex_); - if (instance_) delete instance_; + if (instance_) delete instance_.load(); instance_ = NULL; } From 8fc0d419e07b851c016e69880958f08fea24e7f7 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 18 Oct 2019 20:05:16 -0500 Subject: [PATCH 062/153] Create rocprof.md --- doc/rocprof.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/rocprof.md diff --git a/doc/rocprof.md b/doc/rocprof.md new file mode 100644 index 00000000..b4b7a848 --- /dev/null +++ b/doc/rocprof.md @@ -0,0 +1 @@ +# rocprof From 4adacd14f0c8b031315038321a0274b908cc912c Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 18 Oct 2019 20:51:27 -0500 Subject: [PATCH 063/153] Update rocprof.md --- doc/rocprof.md | 382 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) diff --git a/doc/rocprof.md b/doc/rocprof.md index b4b7a848..648648fb 100644 --- a/doc/rocprof.md +++ b/doc/rocprof.md @@ -1 +1,383 @@ # rocprof +## 1. Overview +The rocProf is a command line tool implemented on the top of rocProfiler and rocTracer APIs. Source code for rocProf may be found here: +GitHub: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/bin/rocprof +This command line tool is implemented as a script which is setting up the environment for attaching the profiler and then run the provided application command line. The tool uses two profiling plugins loaded by ROC runtime and based on rocProfiler and rocTracer for collecting metrics/counters, HW traces and runtime API/activity traces. The tool consumes an input XML or text file with counters list or trace parameters and provides output profiling data and statistics in various formats as text, CSV and JSON traces. Google Chrome tracing can be used to visualize the JSON traces with runtime API/activity timelines and per kernel counters data. +## 2. 
Profiling Modes +‘rocprof’ can be used for GPU profiling using HW counters and for application tracing. +### 2.1. GPU profiling +GPU profiling is controlled with an input file which defines a list of metrics/counters and a profiling scope. An input file is provided using option ‘-i <input file>’. An output CSV file with a line per submitted kernel is generated. Each line has the kernel name, kernel parameters and counter values. With option ‘--stats’ the kernel execution stats can be generated in CSV format. Currently profiling has the limitation of serializing submitted kernels. +An example of input file: +``` + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts + # Perf counters group 2 + pmc : TCC_HIT[0], TCC_MISS[0] + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 +``` +An example of profiling command line for ‘MatrixTranspose’ application +``` +$ rocprof -i input.txt MatrixTranspose +RPL: on '191018_011134' from '/…./rocprofiler_pkg' in '/…./MatrixTranspose' +RPL: profiling '"./MatrixTranspose"' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_011134_9695' +RPL: result dir '/tmp/rpl_data_191018_011134_9695/input0_results_191018_011134' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_011134_9695/input0.xml" + gpu_index = + kernel = + range = + 4 metrics + L2CacheHit, VFetchInsts, VWriteInsts, MemUnitStalled + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] +PASSED! + +ROCPRofiler: 1 contexts collected, output directory /tmp/rpl_data_191018_011134_9695/input0_results_191018_011134 +RPL: '/…./MatrixTranspose/input.csv' is generated +``` +#### 2.1.1. Counters and metrics +There are two profiling features, metrics and traces. Hardware performance counters are treated as the basic metrics and the formulas can be defined for derived metrics. 
+Counters and metrics can be dynamically configured using XML configuration files with counters and metrics tables: + - Counters table entry, basic metric: counter name, block name, event id + - Derived metrics table entry: metric name, an expression for calculation the metric from the counters + +Metrics XML File Example: +``` + + + + . . . + + + + . . . + + + + + +``` +##### 2.1.1.1. Metrics query +Available counters and metrics can be queried by options ‘—list-basic’ for counters and ‘—list-derived’ for derived metrics. The output for counters indicates number of block instances and number of block counter registers. The output for derived metrics prints the metrics expressions. +Examples: +``` +$ rocprof --list-basic +RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCProfiler: rc-file '/…./rpl_rc.xml' +Basic HW counters: + gpu-agent0 : GRBM_COUNT : Tie High - Count Number of Clocks + block GRBM has 2 counters + gpu-agent0 : GRBM_GUI_ACTIVE : The GUI is Active + block GRBM has 2 counters + . . . + gpu-agent0 : TCC_HIT[0-15] : Number of cache hits. + block TCC has 4 counters + gpu-agent0 : TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. + block TCC has 4 counters + . . . + +$ rocprof --list-derived +RPL: on '191018_015911' from '/opt/rocm/rocprofiler' in '/home/evgeny/work/BUILD/0_MatrixTranspose' +ROCProfiler: rc-file '/home/evgeny/rpl_rc.xml' +Derived metrics: + gpu-agent0 : TCC_HIT_sum : Number of cache hits. Sum over TCC instances. + TCC_HIT_sum = sum(TCC_HIT,16) + gpu-agent0 : TCC_MISS_sum : Number of cache misses. Sum over TCC instances. + TCC_MISS_sum = sum(TCC_MISS,16) + gpu-agent0 : TCC_MC_RDREQ_sum : Number of 32-byte reads. Sum over TCC instaces. + TCC_MC_RDREQ_sum = sum(TCC_MC_RDREQ,16) + . . . +``` +##### 2.1.1.2. Metrics collecting +Counters and metrics accumulated per kernel can be collected using input file with a list of metrics, see an example in 2.1. 
+Currently profiling has limitation of serializing submitted kernels. +The number of counters which can be dumped by one run is limited by GPU HW by number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. +###### 2.1.1.2.1. Blocks instancing +GPU blocks are implemented as several identical instances. To dump counters of specific instance square brackets can be used, see an example in 2.1. +The number of block instances can be queried, see 2.1.1.1. +###### 2.1.1.2.2. HW limitations +The number of counters which can be dumped by one run is limited by GPU HW by number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. + - Metrics groups + +To dump a list of metrics exceeding HW limitations the metrics list can be split on groups. +The tool supports automatic splitting on optimal metric groups: +``` +$ rocprof -i input.txt ./MatrixTranspose +RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +RPL: profiling './MatrixTranspose' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_032645_12106' +RPL: result dir '/tmp/rpl_data_191018_032645_12106/input0_results_191018_032645' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_032645_12106/input0.xml" + gpu_index = + kernel = + range = + 20 metrics + Wavefronts, VALUInsts, SALUInsts, SFetchInsts, FlatVMemInsts, LDSInsts, FlatLDSInsts, GDSInsts, VALUUtilization, FetchSize, WriteSize, L2CacheHit, VWriteInsts, GPUBusy, VALUBusy, SALUBusy, MemUnitStalled, WriteUnitStalled, LDSBankConflict, MemUnitBusy + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] + +Input metrics out of HW limit. 
Proposed metrics group set: + group1: L2CacheHit VWriteInsts MemUnitStalled WriteUnitStalled MemUnitBusy FetchSize FlatVMemInsts LDSInsts VALUInsts SALUInsts SFetchInsts FlatLDSInsts GPUBusy Wavefronts + group2: WriteSize GDSInsts VALUUtilization VALUBusy SALUBusy LDSBankConflict + +ERROR: rocprofiler_open(), Construct(), Metrics list exceeds HW limits + +Aborted (core dumped) +Error found, profiling aborted. +``` + - Collecting with multiple runs + +To collect several metric groups a full application replay is used by defining several ‘pmc:’ lines in the input file, see 2.1. + +### 2.2. Application tracing +Supported application tracing includes runtime API and GPU activity tracing. +Supported runtimes are: ROCr (HSA API) and HIP +Supported GPU activity: kernel execution, async memory copy, barrier packets. +The trace is generated in JSON format compatible with Chrome tracing. +The trace consists of several sections with timelines for API trace per thread and GPU activity. The timelines events show event name and parameters. +Supported options: ‘--hsa-trace’, ‘--hip-trace’, ‘--sys-trace’, where ‘sys trace’ is for HIP and HSA combined trace. +#### 2.2.1. HIP runtime trace +The trace is generated by option ‘--hip-trace’ and includes HIP API timelines and GPU activity at the runtime level. +#### 2.2.2. ROCr runtime trace +The trace is generated by option ‘--hsa-trace’ and includes ROCr API timelines and GPU activity at AQL queue level. It can also provide counters per kernel. +#### 2.2.3. KFD driver trace +It is planned to include Thunk API trace and memory allocations/migration tracing. +#### 2.2.4. Code annotation +Support for application code annotation. +Start/stop API is supported to programmatically control the profiling. +A ‘roctx’ library provides annotation API. Annotation is visualized in JSON trace as a separate "Markers and Ranges" timeline section. +##### 2.2.4.1. 
Start/stop API +``` +// Tracing start API +void roctracer_start(); + +// Tracing stop API +void roctracer_stop(); +``` +##### 2.2.4.2. rocTX basic markers API +``` +// A marker created by given ASCII message +void roctxMark(const char* message); + +// Returns the 0 based level of a nested range being started by given message associated to this range. +// A negative value is returned on the error. +int roctxRangePush(const char* message); + +// Marks the end of a nested range. +// Returns the 0 based level of the range. +// A negative value is returned on the error. +int roctxRangePop(); +``` +### 2.3. Multiple GPUs profiling +The profiler supports profiling of multiple GPUs and provides the GPU id for counters and kernels data in the CSV output file. Also, the GPU id is indicated for the respective GPU activity timeline in the JSON trace. +## 3. Profiling control +Profiling can be controlled by specifying a profiling scope, by filtering trace events and specifying interesting time intervals. +### 3.1. Profiling scope +Counters profiling scope can be specified by GPU id list, kernel name substrings list and dispatch range. +Supported range formats examples: "3:9", "3:", "3". You can see an example of input file in 2.1. +### 3.2. Tracing control +Tracing can be filtered by events names using profiler input file and by enabling interesting time intervals by command line option. +#### 3.2.1. Filtering traced APIs +A list of traced API names can be specified in profiler input file. +An example of input file line for ROCr runtime trace (HSA API): +hsa: hsa_queue_create hsa_amd_memory_pool_allocate +#### 3.2.2. Tracing time period +Trace can be dumped periodically with initial delay, dumping period length and rate: +``` +--trace-period +``` +### 3.3. Concurrent kernels +Concurrent kernels profiling is currently not supported; it is a planned feature. Kernels are serialized. +### 3.4. Multi-processes profiling +Multi-processes profiling is not currently supported. +### 3.5. 
Errors logging +Profiler errors are logged to global logs: +``` +/tmp/aql_profile_log.txt +/tmp/rocprofiler_log.txt +/tmp/roctracer_log.txt +``` +## 4. 3rd party visualization tools +‘rocprof’ is producing JSON trace compatible with Chrome Tracing, which is an internal trace visualization tool in Google Chrome. +### 4.1. Chrome tracing +Good review can be found by the link: https://aras-p.info/blog/2017/01/23/Chrome-Tracing-as-Profiler-Frontend/ +## 5. Command line options +The command line options can be printed with option ‘-h’: +``` +$ rocprof -h +RPL: on '191018_023018' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package. +Full path: /opt/rocm/rocprofiler/bin/rocprof +Metrics definition: /opt/rocm/rocprofiler/lib/metrics.xml + +Usage: + rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize + # Perf counters group 2 + pmc : WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 + + Input file .xml format, for single profiling run: + + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + + + # Filter by dispatches range, GPU index and kernel names + + + -o - output CSV file [.csv] + -d - 
directory where profiler store profiling data including traces [/tmp] + The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. + + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] + --ctx-wait - to wait for outstanding contexts on profiler exit [on] + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + + --stats - generating kernel execution stats, file .stats.csv + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible + Generated files: .hsa_stats.txt .json + Traced API list can be set by input .txt or .xml files. + Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + + + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: + +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: + First the configuration file is looking in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. + An example of 'rpl_rc.xml': + +``` +## 6. Publicly available counters and metrics +The following counters are publicly available for commercially available VEGA10/20 GPUs. 
+ +Counters: +``` +• GRBM_COUNT : Tie High - Count Number of Clocks +• GRBM_GUI_ACTIVE : The GUI is Active +• SQ_WAVES : Count number of waves sent to SQs. (per-simd, emulated, global) +• SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) +• SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) +• SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) +• SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) +• SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) +• SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) +• SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) +• SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd) +• SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) +• TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. +• TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. +• TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. +• TCC_HIT[0-15] : Number of cache hits. 
+• TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. +• TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. +• TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. +• TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. +• TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) +• TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests +• TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. +``` + +The following derived metrics have been defined and the profiler metrics XML specification can be found at: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/test/tool/metrics.xml. + +Metrics: +``` +• TA_BUSY_avr : TA block is busy. Average over TA instances. +• TA_BUSY_max : TA block is busy. Max over TA instances. +• TA_BUSY_min : TA block is busy. Min over TA instances. +• TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. +• TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. +• TCC_HIT_sum : Number of cache hits. Sum over TCC instances. +• TCC_MISS_sum : Number of cache misses. Sum over TCC instances. +• TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. +• TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. +• TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. +• TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. 
+• TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. +• TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. +• FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• GPUBusy : The percentage of time GPU was busy. +• Wavefronts : Total wavefronts. +• VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). +• SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). +• VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. +• SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). +• VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. +• FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. +• LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. +• FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). +• GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). 
+• VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). +• VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• Mem32Bwrites : +• FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). +• MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). +• MemUnitStalled : The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). +• WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). +• ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). +• LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). 
+``` From b7a108677ec367c31ea14d8c43423e67ad7c4925 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 22 Nov 2019 17:57:59 -0600 Subject: [PATCH 064/153] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 82f525dc..b8f26bd2 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,9 @@ Options: --heartbeat - to print progress heartbeats [0 - disabled] --stats - generating kernel executino stats, file .stats.csv + --roctx-trace - to enable rocTX trace: + Will show the application code instrumentation rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace + "Markers and Ranges" section. Application code needs to be explicitely instrumented with rocTX events. --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing 'HCC_HOME' env var is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing From 908ea02e256818f27065777c44c136003abb07a8 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 22 Nov 2019 18:47:13 -0600 Subject: [PATCH 065/153] Update README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b8f26bd2..3eabe823 100644 --- a/README.md +++ b/README.md @@ -140,9 +140,6 @@ Options: --heartbeat - to print progress heartbeats [0 - disabled] --stats - generating kernel executino stats, file .stats.csv - --roctx-trace - to enable rocTX trace: - Will show the application code instrumentation rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace - "Markers and Ranges" section. Application code needs to be explicitely instrumented with rocTX events. 
--hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing 'HCC_HOME' env var is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing @@ -155,6 +152,11 @@ Options: + + --roctx-trace - to enable rocTX applicatin code annotation trace; should be use in addition to the trace optins above. + Will show the application code annotation with rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace + "Markers and Ranges" section. + Application code needs to be explicitely instrumented using rocTX events APIs. Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: From b49880d25093ae3c1bf52296eff2d03e510de011 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 22 Nov 2019 18:48:09 -0600 Subject: [PATCH 066/153] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3eabe823..e1bc47f3 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ Options: Will show the application code annotation with rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace "Markers and Ranges" section. Application code needs to be explicitely instrumented using rocTX events APIs. + See roctracer documentation on rocTX API details. Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:/home/evgeny: From 54aaa3da88fff0f7fc336e3730f15689afd47a73 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 22 Nov 2019 18:48:48 -0600 Subject: [PATCH 067/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e1bc47f3..2f63df0b 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,7 @@ Options: - --roctx-trace - to enable rocTX applicatin code annotation trace; should be use in addition to the trace optins above. + --roctx-trace - to enable rocTX applicatin code annotation trace; should be use in addition to the trace options above. Will show the application code annotation with rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace "Markers and Ranges" section. Application code needs to be explicitely instrumented using rocTX events APIs. From 4e0158a8efeb4de4df7666ae306dcf56f3eb5c15 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sat, 23 Nov 2019 00:25:30 -0600 Subject: [PATCH 068/153] rocm2.10 update --- bin/dform.py | 5 +++ bin/rpl_run.sh | 56 +++++++++++++++++++-------- bin/tblextr.py | 83 ++++++++++++++++++++++++++++++++++++---- src/core/metrics.h | 14 ++++++- src/core/rocprofiler.cpp | 4 +- 5 files changed, 133 insertions(+), 29 deletions(-) diff --git a/bin/dform.py b/bin/dform.py index 1e5c63b1..f797d637 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -24,6 +24,11 @@ def gen_api_json_trace(db, table, start_us, outfile): db.dump_json('B', table, outfile) db.execute('DROP VIEW B') +def gen_ext_json_trace(db, table, start_us, outfile): + db.execute('create view B as select Name as name, pid, tid, (BeginNs/1000 - %d) as ts, ((EndNs - BeginNs)/1000) as dur from %s order by ts asc;' % (start_us, table)); + db.dump_json('B', table, outfile) + db.execute('DROP VIEW B') + def gen_ops_json_trace(db, table, base_pid, start_us, outfile): db.execute('create view B as select "Index", Name as name, ("dev-id" + 
%d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); db.dump_json('B', table, outfile) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 2e0ba8ba..a6299b66 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -45,6 +45,7 @@ if [ -z "$HCC_HOME" ] ; then fi # runtime API trace +ROCTX_TRACE=0 HSA_TRACE=0 SYS_TRACE=0 HIP_TRACE=0 @@ -142,7 +143,6 @@ usage() { echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." echo " -t - to change the temporary directory [/tmp]" echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." - echo " -m - file defining custom metrics to use in-place of defaults." echo "" echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" @@ -151,6 +151,7 @@ usage() { echo " --heartbeat - to print progress heartbeats [0 - disabled]" echo "" echo " --stats - generating kernel execution stats, file .stats.csv" + echo " --roctx-trace - to enable rocTX trace" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" echo " --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible" echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" @@ -214,23 +215,22 @@ run() { fi API_TRACE="" - if [ "$HSA_TRACE" = 1 ] ; then - API_TRACE="hsa" + if [ "$ROCTX_TRACE" = 1 ] ; then + API_TRACE=${API_TRACE}":roctx" fi - if [ "$SYS_TRACE" = 1 ] ; then - API_TRACE="sys" + if [ "$HSA_TRACE" = 1 ] ; then + API_TRACE=${API_TRACE}":hsa" fi if [ "$HIP_TRACE" = 1 ] ; then - if [ -z "$API_TRACE" ] ; then - API_TRACE="hip"; - else - API_TRACE="all" - fi + 
API_TRACE=${API_TRACE}":hip" + fi + if [ "$SYS_TRACE" = 1 ] ; then + API_TRACE=${API_TRACE}":sys" fi + if [ -n "$API_TRACE" ] ; then - API_TRACE=$(echo $API_TRACE | sed 's/all//') - if [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE; fi - if [ "$API_TRACE" = "hip" -o "$API_TRACE" = "sys" ] ; then + export ROCTRACER_DOMAIN=$API_TRACE + if [ "$HSA_TRACE" = 0 ] ; then OUTPUT_LIST="$ROCP_OUTPUT_DIR/" fi export HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" @@ -241,7 +241,6 @@ run() { redirection_cmd="2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" fi - #unset ROCP_OUTPUT_DIR CMD_LINE="$APP_CMD $redirection_cmd" eval "$CMD_LINE" } @@ -259,6 +258,29 @@ merge_output() { done } +convert_time_val() { + local time_maxumim_us=$((0xffffffff)) + local __resultvar=$1 + eval "local val=$"$__resultvar + val_m=`echo $val | sed -n "s/^\([0-9]*\)m$/\1/p"` + val_s=`echo $val | sed -n "s/^\([0-9]*\)s$/\1/p"` + val_ms=`echo $val | sed -n "s/^\([0-9]*\)ms$/\1/p"` + val_us=`echo $val | sed -n "s/^\([0-9]*\)us$/\1/p"` + if [ -n "$val_m" ] ; then val_us=$((val_m*60000000)) + elif [ -n "$val_s" ] ; then val_us=$((val_s*1000000)) + elif [ -n "$val_ms" ] ; then val_us=$((val_ms*1000)) + fi + + if [ -z "$val_us" ] ; then + error_message="invalid time value format ($val)" + elif [ "$val_us" -gt "$time_maxumim_us" ] ; then + error_message="time value exceeds maximum supported ($val > ${time_maxumim_us}us)" + else + eval $__resultvar="'$val_us'" + fi +} + +################################################################################################ # main echo "RPL: on '$time_stamp' from '$PKG_DIR' in '$RUN_DIR'" # Parsing arguments @@ -289,9 +311,6 @@ while [ 1 ] ; do if [ "$OUTPUT_DIR" = "-" ] ; then DATA_PATH=$TMP_DIR fi - elif [ "$1" = "-m" ] ; then - unset ROCP_METRICS - export ROCP_METRICS="$2" elif [ "$1" = "--list-basic" ] ; then export ROCP_INFO=b eval "$PKG_DIR/tool/ctrl" @@ -326,6 +345,9 @@ while [ 1 ] ; do ARG_VAL=0 export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 + elif [ "$1" = 
"--roctx-trace" ] ; then + ARG_VAL=0 + ROCTX_TRACE=1 elif [ "$1" = "--hsa-trace" ] ; then ARG_VAL=0 export ROCP_TIMESTAMP_ON=1 diff --git a/bin/tblextr.py b/bin/tblextr.py index 057e984a..895f41b1 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -36,7 +36,8 @@ OPS_PID = 1 HSA_PID = 2 HIP_PID = 3 -GPU_BASE_PID = 4 +EXT_PID = 4 +GPU_BASE_PID = 5 max_gpu_id = -1 START_US = 0 @@ -186,10 +187,66 @@ def fill_kernel_db(table_name, db): db.insert_entry(table_handle, val_list) ############################################################# -# fill HSA DB -hsa_table_descr = [ +# Fill Ext DB +ext_table_descr = [ + ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'Index'], + {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'Index':'INTEGER'} +] +def fill_ext_db(table_name, db, indir, trace_name, api_pid): + file_name = indir + '/' + trace_name + '_trace.txt' + ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(.*)$') + + if not os.path.isfile(file_name): return 0 + + range_stack = {} + + record_id = 0 + table_handle = db.add_table(table_name, ext_table_descr) + with open(file_name, mode='r') as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + tms = int(m.group(1)) + pid = m.group(2) + tid = m.group(3) + cid = int(m.group(4)) + msg = m.group(5) + + rec_vals = [] + + if cid != 2: + rec_vals.append(tms) + rec_vals.append(tms + 1) + rec_vals.append(api_pid) + rec_vals.append(tid) + rec_vals.append(msg) + rec_vals.append(record_id) + + if cid == 1: + if not pid in range_stack: range_stack[pid] = {} + pid_stack = range_stack[pid] + if not tid in pid_stack: pid_stack[tid] = [] + rec_stack = pid_stack[tid] + rec_stack.append(rec_vals) + continue + + if cid == 2: + pid_stack = range_stack[pid] + rec_stack = pid_stack[tid] + rec_vals = rec_stack.pop() + rec_vals[1] = tms + + db.insert_entry(table_handle, rec_vals) + record_id += 1 + + return 1 +############################################################# + 
+# Fill API DB +api_table_descr = [ ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index'], - {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} + {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER'} ] def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): file_name = indir + '/' + api_name + '_api_trace.txt' @@ -211,14 +268,15 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep START_US = 0 record_id = 0 - table_handle = db.add_table(table_name, hsa_table_descr) + table_handle = db.add_table(table_name, api_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): record = line[:-1] m = ptrn_val.match(record) if m: rec_vals = [] - for ind in range(1,7): + rec_len = len(api_table_descr[0]) + for ind in range(1,rec_len): rec_vals.append(m.group(ind)) rec_vals[2] = api_pid rec_vals.append(record_id) @@ -332,7 +390,8 @@ def fill_ops_db(table_name, db, indir): outfile = sys.argv[1] infiles = sys.argv[2:] indir = re.sub(r'\/[^\/]*$', r'', infiles[0]) -inext = re.sub(r'^[^\.]*', r'', infiles[0]) +inext = re.sub(r'\s+$', r'', infiles[0]) +inext = re.sub(r'^.*(\.[^\.]+)$', r'\1', inext) dbfile = '' csvfile = '' @@ -358,6 +417,8 @@ def fill_ops_db(table_name, db, indir): with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) + ext_trace_found = fill_ext_db('rocTX', db, indir, 'roctx', EXT_PID) + hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) hsa_activity_found = fill_copy_db('COPY', db, indir) @@ -366,10 +427,13 @@ def fill_ops_db(table_name, db, indir): fill_kernel_db('A', db) - any_trace_found = hsa_trace_found | hip_trace_found + any_trace_found = ext_trace_found | hsa_trace_found | hip_trace_found if any_trace_found: db.open_json(jsonfile) + if ext_trace_found: + 
db.label_json(EXT_PID, "Markers and Ranges", jsonfile) + if hsa_trace_found: db.label_json(HSA_PID, "CPU HSA API", jsonfile) if hsa_activity_found: @@ -382,6 +446,9 @@ def fill_ops_db(table_name, db, indir): for ind in range(0, int(max_gpu_id) + 1): db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) + if ext_trace_found: + dform.gen_ext_json_trace(db, 'rocTX', START_US, jsonfile) + if len(var_table) != 0: dform.post_process_data(db, 'A', csvfile) dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') diff --git a/src/core/metrics.h b/src/core/metrics.h index 547156de..57ec7c31 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -171,11 +171,20 @@ class MetricsDict { const_iterator_t Begin() const { return cache_.begin(); } const_iterator_t End() const { return cache_.end(); } + std::string GetAgentName() const { return agent_name_; } + + xml::Xml::nodes_t GetNodes() const { + auto nodes_vec = GetNodes(agent_name_); + auto global_vec = GetNodes("global"); + nodes_vec.insert(nodes_vec.end(), global_vec.begin(), global_vec.end()); + return nodes_vec; + } + + private: xml::Xml::nodes_t GetNodes(const std::string& scope) const { return (xml_ != NULL) ? xml_->GetNodes("top." 
+ scope + ".metric") : xml::Xml::nodes_t(); } - private: MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL), agent_info_(agent_info) { const char* xml_name = getenv("ROCP_METRICS"); if (xml_name != NULL) { @@ -186,11 +195,13 @@ class MetricsDict { xml_->AddConst("top.const.metric", "SIMD_NUM", agent_info->simds_per_cu * agent_info->cu_num); xml_->AddConst("top.const.metric", "SE_NUM", agent_info->se_num); ImportMetrics(agent_info, "const"); + agent_name_ = agent_info->name; if (std::string("gfx906") == agent_info->name) { ImportMetrics(agent_info, agent_info->name); } else if (std::string("gfx908") == agent_info->name) { ImportMetrics(agent_info, agent_info->name); } else { + agent_name_ = agent_info->gfxip; ImportMetrics(agent_info, agent_info->gfxip); } ImportMetrics(agent_info, "global"); @@ -327,6 +338,7 @@ class MetricsDict { xml::Xml* xml_; const util::AgentInfo* agent_info_; + std::string agent_name_; cache_t cache_; static map_t* map_; diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 090e5492..cbfbc432 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -738,9 +738,7 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_info( case ROCPROFILER_INFO_KIND_METRIC: { const rocprofiler::MetricsDict* dict = rocprofiler::GetMetrics(agent_info->dev_id); - auto nodes_vec = dict->GetNodes(agent_info->gfxip); - auto global_vec = dict->GetNodes("global"); - nodes_vec.insert(nodes_vec.end(), global_vec.begin(), global_vec.end()); + auto nodes_vec = dict->GetNodes(); for (auto* node : nodes_vec) { const std::string& name = node->opts["name"]; From 8914bf04559ea18401612570cd9df15d9b370d29 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 27 Nov 2019 17:37:54 -0600 Subject: [PATCH 069/153] Update LICENSE --- LICENSE | 1 + 1 file changed, 1 insertion(+) diff --git a/LICENSE b/LICENSE index 9e78331e..8384c985 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,5 @@ Copyright (c) 2018 Advanced 
Micro Devices, Inc. All rights reserved. +[MITx11 License] Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 8703877d2373c530c83945ccd96aeda3867ea966 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 3 Dec 2019 16:47:39 -0600 Subject: [PATCH 070/153] custom metrics file rocprof option; profiled cmd line quoting option; --- bin/rpl_run.sh | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index a6299b66..6a7bbf80 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -106,6 +106,7 @@ usage() { echo " --verbose - verbose mode, dumping all base counters used in the input metrics" echo " --list-basic - to print the list of basic HW counters" echo " --list-derived - to print the list of derived metrics with formulas" + echo " --cmd-qts - quoting profiled cmd line [on]" echo "" echo " -i <.txt|.xml file> - input file" echo " Input file .txt format, automatically rerun application for every pmc line:" @@ -143,6 +144,7 @@ usage() { echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." echo " -t - to change the temporary directory [/tmp]" echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." + echo " -m - file defining custom metrics to use in-place of defaults." 
echo "" echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" @@ -311,6 +313,9 @@ while [ 1 ] ; do if [ "$OUTPUT_DIR" = "-" ] ; then DATA_PATH=$TMP_DIR fi + elif [ "$1" = "-m" ] ; then + unset ROCP_METRICS + export ROCP_METRICS="$2" elif [ "$1" = "--list-basic" ] ; then export ROCP_INFO=b eval "$PKG_DIR/tool/ctrl" @@ -366,6 +371,10 @@ while [ 1 ] ; do elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 + elif [ "$1" = "--cmd-qts" ] ; then + if [ "$2" = "off" ] ; then + CMD_QTS=0 + fi else break fi @@ -404,14 +413,18 @@ else csv_output=$RUN_DIR/${input_base}.csv fi -APP_CMD="" -for i in `seq 1 $#`; do - if [ -n "$APP_CMD" ] ; then - APP_CMD=$APP_CMD" " - fi - eval "arg=\${$i}" - APP_CMD=$APP_CMD\"$arg\" -done +# Profiled cmd line string +APP_CMD=$* +if [ "$CMD_QTS" = 1 ] ; then + APP_CMD="" + for i in `seq 1 $#`; do + if [ -n "$APP_CMD" ] ; then + APP_CMD=$APP_CMD" " + fi + eval "arg=\${$i}" + APP_CMD=$APP_CMD\"$arg\" + done +fi echo "RPL: profiling '$APP_CMD'" echo "RPL: input file '$INPUT_FILE'" From b4d4c031fd54630a64b49a7f339bff739f77cd0e Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 17 Dec 2019 01:58:30 -0600 Subject: [PATCH 071/153] 3.0 update --- CMakeLists.txt | 2 +- bin/dform.py | 20 ++- bin/rpl_run.sh | 64 ++++++++-- bin/sqlitedb.py | 12 +- bin/tblextr.py | 76 ++++++++--- bin/txt2xml.sh | 27 ++-- inc/rocprofiler.h | 3 +- src/core/intercept_queue.h | 25 ++-- src/core/rocprofiler.cpp | 12 +- src/core/tracker.h | 8 +- src/util/hsa_rsrc_factory.cpp | 59 +++++++++ src/util/hsa_rsrc_factory.h | 19 +++ test/CMakeLists.txt | 3 + test/app/intercept_test_stand.cpp | 2 +- test/ocl/SimpleConvolution | Bin 0 -> 132704 bytes test/ocl/SimpleConvolution_Kernels.cl | 175 ++++++++++++++++++++++++++ test/run.sh | 8 +- test/tool/tool.cpp | 19 +-- test/util/hsa_rsrc_factory.cpp | 97 +++++++++++--- 
test/util/hsa_rsrc_factory.h | 51 +++++--- 20 files changed, 577 insertions(+), 105 deletions(-) create mode 100755 test/ocl/SimpleConvolution create mode 100644 test/ocl/SimpleConvolution_Kernels.cl diff --git a/CMakeLists.txt b/CMakeLists.txt index 18bbee13..8b81b5a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,7 +126,7 @@ set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/pos ## RPM package specific variables set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" ) -set ( CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) +set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) include ( CPack ) diff --git a/bin/dform.py b/bin/dform.py index f797d637..93194608 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -1,13 +1,18 @@ #!/usr/bin/python from sqlitedb import SQLiteDB -def post_process_data(db, table_name, outfile = ''): +def gen_message(outfile): + if outfile != '': + print("File '" + outfile + "' is generating") + +def post_process_data(db, table_name, outfile = ''): # db.add_data_column('A', 'DispDurNs', 'INTEGER', 'BeginNs - DispatchNs') # db.add_data_column('A', 'ComplDurNs', 'INTEGER', 'CompleteNs - EndNs') # db.add_data_column('A', 'TotalDurNs', 'INTEGER', 'CompleteNs - DispatchNs') # db.add_data_column(table_name, 'TimeNs', 'INTEGER', 'BeginNs - %d' % start_ns) db.add_data_column(table_name, 'DurationNs', 'INTEGER', 'EndNs - BeginNs') if outfile != '': db.dump_csv(table_name, outfile) + gen_message(outfile) def gen_data_bins(db, outfile): db.execute('create view C as select Name, Calls, TotalDurationNs, TotalDurationNs/Calls as AverageNs, TotalDurationNs*100.0/(select sum(TotalDurationNs) from %s) as Percentage from %s order by TotalDurationNs desc;' % ('B', 'B')); @@ -18,24 +23,29 @@ def gen_table_bins(db, table, outfile, name_var, dur_ns_var): db.execute('create view B as 
select (%s) as Name, count(%s) as Calls, sum(%s) as TotalDurationNs from %s group by %s' % (name_var, name_var, dur_ns_var, table, name_var)) gen_data_bins(db, outfile) db.execute('DROP VIEW B') + gen_message(outfile) def gen_api_json_trace(db, table, start_us, outfile): - db.execute('create view B as select "Index", Name as name, pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (start_us, table)); + db.execute('create view B as select "Index", Name as name, pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') + gen_message(outfile) def gen_ext_json_trace(db, table, start_us, outfile): - db.execute('create view B as select Name as name, pid, tid, (BeginNs/1000 - %d) as ts, ((EndNs - BeginNs)/1000) as dur from %s order by ts asc;' % (start_us, table)); + db.execute('create view B as select Name as name, pid, tid, (BeginNs/1000 - %d) as ts, ((EndNs - BeginNs)/1000) as dur from %s;' % (start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') + gen_message(outfile) def gen_ops_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') + gen_message(outfile) def gen_kernel_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.execute('create view B as select "Index", 
KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') + gen_message(outfile) ############################################################################################## diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 6a7bbf80..86383d14 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -46,6 +46,7 @@ fi # runtime API trace ROCTX_TRACE=0 +KFD_TRACE=0 HSA_TRACE=0 SYS_TRACE=0 HIP_TRACE=0 @@ -53,6 +54,9 @@ HIP_TRACE=0 # Generate stats GEN_STATS=0 +# Quoting profiled cmd line +CMD_QTS=1 + export PATH=.:$PATH # enable error logging @@ -91,6 +95,13 @@ error() { exit 1 } +error_message="" +errck() { + if [ -n "$error_message" ]; then + fatal "$1 : $error_message" + fi +} + # usage method usage() { bin_name=`basename $0` @@ -109,7 +120,7 @@ usage() { echo " --cmd-qts - quoting profiled cmd line [on]" echo "" echo " -i <.txt|.xml file> - input file" - echo " Input file .txt format, automatically rerun application for every pmc line:" + echo " Input file .txt format, automatically rerun application for every profiling features line:" echo "" echo " # Perf counters group 1" echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize" @@ -154,6 +165,7 @@ usage() { echo "" echo " --stats - generating kernel execution stats, file .stats.csv" echo " --roctx-trace - to enable rocTX trace" + echo " --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" echo " --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible" echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" @@ -167,6 +179,10 @@ usage() { echo " 
" echo " " echo "" + echo " --trace-period - to enable trace with initial delay, with periodic sample length and rate" + echo " Supported time formats: " + echo " --obj-tracking - to turn on/off kernels code objects tracking [off]" + echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." @@ -212,7 +228,6 @@ run() { fi fi mkdir -p "$ROCP_OUTPUT_DIR" - OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" fi @@ -220,8 +235,9 @@ run() { if [ "$ROCTX_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":roctx" fi - if [ "$HSA_TRACE" = 1 ] ; then - API_TRACE=${API_TRACE}":hsa" + if [ "$KFD_TRACE" = 1 ] ; then + API_TRACE=${API_TRACE}":kfd" + export LD_PRELOAD="libkfdwrapper64.so libhsakmt.so.1" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" @@ -230,11 +246,12 @@ run() { API_TRACE=${API_TRACE}":sys" fi - if [ -n "$API_TRACE" ] ; then + if [ "$HSA_TRACE" = 1 ] ; then + export ROCTRACER_DOMAIN=$API_TRACE":hsa" + export HSA_TOOLS_LIB="$HSA_TOOLS_LIB $TTLIB_PATH/libtracer_tool.so" + elif [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE - if [ "$HSA_TRACE" = 0 ] ; then - OUTPUT_LIST="$ROCP_OUTPUT_DIR/" - fi + OUTPUT_LIST="$ROCP_OUTPUT_DIR/" export HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" fi @@ -245,6 +262,8 @@ run() { CMD_LINE="$APP_CMD $redirection_cmd" eval "$CMD_LINE" + + unset LD_PRELOAD } merge_output() { @@ -353,6 +372,11 @@ while [ 1 ] ; do elif [ "$1" = "--roctx-trace" ] ; then ARG_VAL=0 ROCTX_TRACE=1 + elif [ "$1" = "--kfd-trace" ] ; then + ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 + GEN_STATS=1 + KFD_TRACE=1 elif [ "$1" = "--hsa-trace" ] ; then ARG_VAL=0 export ROCP_TIMESTAMP_ON=1 @@ -368,6 +392,26 @@ while [ 1 ] ; do export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 HIP_TRACE=1 + elif [ "$1" = "--trace-period" ] ; 
then + period_expr="^\([^:]*\):\([^:]*\):\([^:]*\)$" + period_ck=`echo "$2" | sed -n "s/"${period_expr}"/ok/p"` + if [ -z "$period_ck" ] ; then + fatal "Wrong option '$1 $2'" + fi + period_delay=`echo "$2" | sed -n "s/"${period_expr}"/\1/p"` + period_len=`echo "$2" | sed -n "s/"${period_expr}"/\2/p"` + period_rate=`echo "$2" | sed -n "s/"${period_expr}"/\3/p"` + convert_time_val period_delay + errck "Option '$ARG_IN', delay value" + convert_time_val period_len + errck "Option '$ARG_IN', length value" + convert_time_val period_rate + errck "Option '$ARG_IN', rate value" + export ROCP_CTRL_RATE="$period_delay:$period_len:$period_rate" + elif [ "$1" = "--obj-tracking" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_OBJ_TRACKING=1 + fi elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 @@ -475,9 +519,7 @@ if [ -n "$csv_output" ] ; then else python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST fi - if [ "$?" -eq 0 ] ; then - echo "RPL: '$csv_output' is generated" - else + if [ "$?" 
-ne 0 ] ; then echo "Data extracting error: $OUTPUT_LIST'" fi fi diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index e02d4136..cd649e6a 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -6,7 +6,7 @@ class SQLiteDB: def __init__(self, file_name): self.connection = sqlite3.connect(file_name) self.tables = {} - self.json_arg_list_enabled = 0 + self.section_index = 0 def __del__(self): self.connection.close() @@ -83,6 +83,9 @@ def _get_raw_by_id(self, table_name, req_id): raise Exception('Index is not unique, table "' + table_name + '"') return list(raws[0]) + def table_get_raws(self, table_name): + return self._get_raws(table_name) + # dump CSV table def dump_csv(self, table_name, file_name): if not re.search(r'\.csv$', file_name): @@ -111,7 +114,8 @@ def label_json(self, pid, label, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: - fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name"}\n' %(label, pid)); + fd.write(',{"args":{"name":"%s %s"},"ph":"M","pid":%s,"name":"process_name"}\n' %(self.section_index, label, pid)); + self.section_index += 1 def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): if not re.search(r'\.json$', file_name): @@ -137,9 +141,9 @@ def dump_json(self, table_name, data_name, file_name): name_ptrn = re.compile(r'(name|Name)') table_fields = self._get_fields(table_name) - table_raws = self._get_raws_indexed(table_name) + table_raws = self._get_raws(table_name) data_fields = self._get_fields(data_name) - data_raws = self._get_raws_indexed(data_name) + data_raws = self._get_raws(data_name) with open(file_name, mode='a') as fd: table_raws_len = len(table_raws) diff --git a/bin/tblextr.py b/bin/tblextr.py index 895f41b1..490cdb8b 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -32,15 +32,20 @@ # SQ_WAVES (4096) # SQ_INSTS_VMEM_RD (36864) -COPY_PID = 0 
-OPS_PID = 1 -HSA_PID = 2 -HIP_PID = 3 -EXT_PID = 4 -GPU_BASE_PID = 5 +EXT_PID = 0 +COPY_PID = 1 +HIP_PID = 2 +HSA_PID = 3 +KFD_PID = 4 +OPS_PID = 5 +GPU_BASE_PID = 6 +NONE_PID = -1 + max_gpu_id = -1 START_US = 0 +hsa_activity_found = 0 + # dependencies dictionary dep_dict = {} kern_dep_list = [] @@ -166,6 +171,8 @@ def dump_csv(file_name): if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") val_list = [entry[var] for var in var_list] fd.write(','.join(val_list) + '\n'); + + print("File '" + file_name + "' is generating") ############################################################# # fill kernels DB @@ -249,6 +256,12 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER'} ] def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): + global hsa_activity_found + copy_raws = [] + if (hsa_activity_found): copy_raws = db.table_get_raws('COPY') + copy_csv = '' + copy_index = 0 + file_name = indir + '/' + api_name + '_api_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') ptrn_ac = re.compile(r'hsa_amd_memory_async_copy') @@ -288,6 +301,16 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_from_us_list.append(from_us) dep_tid_list.append(int(rec_vals[3])) dep_id_list.append(record_id) + + if len(copy_raws) != 0: + copy_data = list(copy_raws[copy_index]) + args_str = rec_vals[5] + args_str = re.sub(r'\(', r'', args_str) + args_str = re.sub(r'\).*$', r'', args_str) + copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + rec_vals[4] + ', ' + args_str + copy_csv += str(copy_index) + ', ' + copy_line + '\n' + copy_index += 1 + record_id += 1 else: fatal(api_name + " bad record: '" + record + "'") @@ -295,11 +318,18 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, 
dep_list, dep db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) record_id += 1 - if not dep_pid in dep_dict: dep_dict[dep_pid] = {} - dep_dict[dep_pid]['pid'] = api_pid - dep_dict[dep_pid]['tid'] = dep_tid_list - dep_dict[dep_pid]['from'] = dep_from_us_list - if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + if dep_pid != NONE_PID: + if not dep_pid in dep_dict: dep_dict[dep_pid] = {} + dep_dict[dep_pid]['pid'] = api_pid + dep_dict[dep_pid]['tid'] = dep_tid_list + dep_dict[dep_pid]['from'] = dep_from_us_list + if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + + if copy_csv != '': + file_name = os.environ['PWD'] + '/results_mcopy.csv' + with open(file_name, mode='w') as fd: + print("File '" + file_name + "' is generating") + fd.write(copy_csv) return 1 ############################################################# @@ -337,6 +367,8 @@ def fill_copy_db(table_name, db, indir): else: fatal("async-copy bad record: '" + record + "'") dep_dict[COPY_PID]['to'] = dep_to_us_dict + + return 1 ############################################################# # fill HCC ops DB @@ -419,29 +451,35 @@ def fill_ops_db(table_name, db, indir): ext_trace_found = fill_ext_db('rocTX', db, indir, 'roctx', EXT_PID) - hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) + kfd_trace_found = fill_api_db('KFD', db, indir, 'kfd', KFD_PID, NONE_PID, [], {}, 0) + hsa_activity_found = fill_copy_db('COPY', db, indir) + hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) ops_filtr = fill_ops_db('OPS', db, indir) hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) fill_kernel_db('A', db) - any_trace_found = ext_trace_found | hsa_trace_found | hip_trace_found + any_trace_found = ext_trace_found | kfd_trace_found | hsa_trace_found | hip_trace_found if any_trace_found: db.open_json(jsonfile) if ext_trace_found: db.label_json(EXT_PID, 
"Markers and Ranges", jsonfile) + if hip_trace_found: + db.label_json(HIP_PID, "CPU HIP API", jsonfile) + if hsa_trace_found: db.label_json(HSA_PID, "CPU HSA API", jsonfile) + + if kfd_trace_found: + db.label_json(KFD_PID, "CPU KFD API", jsonfile) + if hsa_activity_found: db.label_json(COPY_PID, "COPY", jsonfile) - if hip_trace_found: - db.label_json(HIP_PID, "CPU HIP API", jsonfile) - if any_trace_found and max_gpu_id >= 0: for ind in range(0, int(max_gpu_id) + 1): db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) @@ -474,6 +512,12 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) + if kfd_trace_found: + statfile = re.sub(r'stats', r'kfd_stats', statfile) + dform.post_process_data(db, 'KFD') + dform.gen_table_bins(db, 'KFD', statfile, 'Name', 'DurationNs') + dform.gen_api_json_trace(db, 'KFD', START_US, jsonfile) + if any_trace_found: for (to_pid, dep_str) in dep_dict.items(): if 'bsp' in dep_str: diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh index 27bbe8c4..126337ed 100755 --- a/bin/txt2xml.sh +++ b/bin/txt2xml.sh @@ -64,29 +64,32 @@ parse() { gpu_index=$line fi else - output=$outdir/input${index}.xml - header="# $timestamp '$output' generated with '$0 $*'" - echo $header > $output + found=$(echo $feature | sed -n "/^\(pmc\|sqtt\|hsa\)$/ p") + if [ -n "$found" ] ; then + output=$outdir/input${index}.xml + header="# $timestamp '$output' generated with '$0 $*'" + echo $header > $output - if [ "$feature" == "pmc" ] ; then - line=`echo "$line" | sed -e "s/ /,/g"` - cat >> $output <> $output < EOF - fi + fi - if [ "$feature" == "sqtt" ] ; then - cat >> $output <> $output < EOF - fi + fi - if [ "$feature" == "hsa" ] ; then - cat >> $output <> $output < EOF + fi fi fi diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index b59acfdf..313f7f42 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -64,6 +64,7 @@ uint32_t rocprofiler_version_minor(); 
typedef struct { uint32_t intercept_mode; + uint32_t code_obj_tracking; uint32_t memcopy_tracking; uint32_t trace_size; uint32_t trace_local; @@ -222,7 +223,7 @@ typedef struct { hsa_signal_t completion_signal; // Completion signal const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet const char* kernel_name; // Kernel name - uint64_t kernel_object; // Kernel object pointer + uint64_t kernel_object; // Kernel object address const amd_kernel_code_t* kernel_code; // Kernel code pointer int64_t thread_id; // Thread id const rocprofiler_dispatch_record_t* record; // Dispatch record diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 5a6234ab..f639b3e5 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -148,9 +148,20 @@ class InterceptQueue { } // Prepareing dispatch callback data - const amd_kernel_code_t* kernel_code = GetKernelCode(dispatch_packet); - const uint64_t kernel_symbol = kernel_code->runtime_loader_kernel_symbol; - const char* kernel_name = GetKernelName(kernel_symbol); + uint64_t kernel_object = dispatch_packet->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + + const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1); + if (kernel_object_flag == 0) { + if (!util::HsaRsrcFactory::IsExecutableTracking()) { + fprintf(stderr, "Error: V3 code object detected - code objects tracking should be enabled\n"); + abort(); + } + } + const char* kernel_name = (util::HsaRsrcFactory::IsExecutableTracking()) ? + util::HsaRsrcFactory::GetKernelName(kernel_object) : + GetKernelName(kernel_code->runtime_loader_kernel_symbol); + rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, obj->queue_, @@ -159,7 +170,7 @@ class InterceptQueue { completion_signal, dispatch_packet, kernel_name, - kernel_symbol, + kernel_object, kernel_code, syscall(__NR_gettid), (tracker_entry) ? 
tracker_entry->record : NULL}; @@ -243,14 +254,14 @@ class InterceptQueue { return static_cast((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask); } - static const amd_kernel_code_t* GetKernelCode(const hsa_kernel_dispatch_packet_t* dispatch_packet) { + static const amd_kernel_code_t* GetKernelCode(uint64_t kernel_object) { const amd_kernel_code_t* kernel_code = NULL; hsa_status_t status = util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( - reinterpret_cast(dispatch_packet->kernel_object), + reinterpret_cast(kernel_object), reinterpret_cast(&kernel_code)); if (HSA_STATUS_SUCCESS != status) { - kernel_code = reinterpret_cast(dispatch_packet->kernel_object); + kernel_code = reinterpret_cast(kernel_object); } return kernel_code; } diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index cbfbc432..61fd4619 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -144,7 +144,8 @@ void * tool_handle = NULL; // Return true if intercepting mode is enabled enum { DISPATCH_INTERCEPT_MODE = 0x1, - MEMCOPY_INTERCEPT_MODE = 0x2 + CODE_OBJ_TRACKING_MODE = 0x2, + MEMCOPY_INTERCEPT_MODE = 0x4, }; uint32_t LoadTool() { uint32_t intercept_mode = 0; @@ -188,6 +189,7 @@ uint32_t LoadTool() { util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; + if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE; if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE; } @@ -432,7 +434,13 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa // Loading a tool lib and setting of intercept mode const uint32_t intercept_mode_mask = rocprofiler::LoadTool(); - if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) intercept_mode = true; + if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) { + intercept_mode = true; + } + 
if (intercept_mode_mask & rocprofiler::CODE_OBJ_TRACKING_MODE) { + if (intercept_mode == false) EXC_RAISING(HSA_STATUS_ERROR, "code objects tracking without intercept mode enabled"); + rocprofiler::util::HsaRsrcFactory::EnableExecutableTracking(table); + } if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) { hsa_status_t status = hsa_amd_profiling_async_copy_enable(true); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable"); diff --git a/src/core/tracker.h b/src/core/tracker.h index c4d619c9..e366c761 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -105,9 +105,10 @@ class Tracker { entry->record = record; // Creating a proxy signal - status = hsa_api_.hsa_signal_create(1, 0, NULL, &(entry->signal)); + const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1; + status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); - status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); // Adding antry to the list @@ -210,9 +211,6 @@ class Tracker { amd_signal_t* prof_signal_ptr = reinterpret_cast(entry->signal.handle); orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; - - const hsa_signal_value_t new_value = hsa_api_.hsa_signal_load_relaxed(orig) - 1; - if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value"); hsa_api_.hsa_signal_store_screlease(orig, signal_value); } } diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index a47062dd..65f94357 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ 
b/src/util/hsa_rsrc_factory.cpp @@ -193,6 +193,8 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; + hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; @@ -231,6 +233,8 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; + hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; hsa_api_.hsa_system_get_info = hsa_system_get_info; hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; @@ -337,6 +341,11 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); + // GFX8 and GFX9 SGPR/VGPR block sizes + agent_info->sgpr_block_dflt = (strcmp(agent_info->gfxip, "gfx8") == 0) ? 
1 : 2; + agent_info->sgpr_block_size = 8; + agent_info->vgpr_block_size = 4; + // Set GPU index agent_info->dev_index = gpu_list_.size(); gpu_list_.push_back(agent_info); @@ -681,10 +690,60 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } +const char* HsaRsrcFactory::GetKernelName(uint64_t addr) { + std::lock_guard lck(mutex_); + const auto it = symbols_map_->find(addr); + if (it == symbols_map_->end()) { + fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); + abort(); + } + return strdup(it->second); +} + +void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { + std::lock_guard lck(mutex_); + executable_tracking_on_ = true; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; +} + +hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) { + hsa_symbol_kind_t value = (hsa_symbol_kind_t)0; + hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value); + CHECK_STATUS("Error in getting symbol info", status); + if (value == HSA_SYMBOL_KIND_KERNEL) { + uint64_t addr = 0; + uint32_t len = 0; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr); + CHECK_STATUS("Error in getting kernel object", status); + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); + CHECK_STATUS("Error in getting name len", status); + char *name = new char[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + CHECK_STATUS("Error in getting kernel name", status); + name[len] = 0; + auto ret = symbols_map_->insert({addr, name}); + if (ret.second == false) { + delete[] ret.first->second; + ret.first->second = name; + } + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t 
HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) { + std::lock_guard lck(mutex_); + if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); + CHECK_STATUS("Error in iterating executable symbols", status); + return hsa_api_.hsa_executable_freeze(executable, options);; +} + std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; hsa_pfn_t HsaRsrcFactory::hsa_api_{}; +bool HsaRsrcFactory::executable_tracking_on_ = false; +HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; } // namespace util } // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index af031895..bf7f5fcf 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -95,6 +95,8 @@ struct hsa_pfn_t { decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; decltype(hsa_executable_freeze)* hsa_executable_freeze; decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; + decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; decltype(hsa_system_get_info)* hsa_system_get_info; decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; @@ -160,6 +162,11 @@ struct AgentInfo { // Number of Shader Arrays Per Shader Engines in Gpu uint32_t shader_arrays_per_se; + + // SGPR/VGPR block sizes + uint32_t sgpr_block_dflt; + uint32_t sgpr_block_size; + uint32_t vgpr_block_size; }; // HSA timer class @@ -323,6 +330,11 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + // Enable executables 
loading tracking + static bool IsExecutableTracking() { return executable_tracking_on_; } + static void EnableExecutableTracking(HsaApiTable* table); + static const char* GetKernelName(uint64_t addr); + // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); static const hsa_pfn_t* HsaApi() { return &hsa_api_; } @@ -387,6 +399,13 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; + // Executables loading tracking + typedef std::map symbols_map_t; + static symbols_map_t* symbols_map_; + static bool executable_tracking_on_; + static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); + // HSA runtime API table static hsa_pfn_t hsa_api_; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7f128e86..9212f2af 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -88,3 +88,6 @@ set ( TEST_LIB_SRC ${TEST_DIR}/tool/tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) + +## Copy OCL test +execute_process ( COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test" ) diff --git a/test/app/intercept_test_stand.cpp b/test/app/intercept_test_stand.cpp index de3dbdaf..4f46f65e 100644 --- a/test/app/intercept_test_stand.cpp +++ b/test/app/intercept_test_stand.cpp @@ -73,7 +73,7 @@ void dump_context_entry(context_entry_t* entry) { const rocprofiler_dispatch_record_t* record = entry->data.record; fflush(stdout); - fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str()); + fprintf(stdout, "kernel-object(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str()); if (record) fprintf(stdout, ", 
gpu-id(%u), time(%lu,%lu,%lu,%lu)", HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, record->dispatch, diff --git a/test/ocl/SimpleConvolution b/test/ocl/SimpleConvolution new file mode 100755 index 0000000000000000000000000000000000000000..be4c1332a279c55c80b0461daa2fb94f1f7344e7 GIT binary patch literal 132704 zcmb?^349bq_J0S0Mnoqny77+g=z=0997;Ua2}z)%K_kbSRWS)UNFx`{^=1-!`r`>m>;>2zjTcmKc1CsX}iRlR!e z)vH&ps;hhE=j4s+myn=YpMKgOHA0PJJpv`aHLYl0w2Vg^qaCDqv{dbQ?EsV-fPbr; zM_pOzURBLXlTenQ{`gPC|CoM+n0lB*(&GfAX;x~r&Xny%KOS`{)8rRqNh{T~RFO|C zdEK;1=5Jb6EbvzP0WJu6LC{9CCdA7$hJWR*X;p9c_O4-+k0 zd`DB3Q7G|u;GBFEYh+*GjTBs z|J4d{jY?~gdhuV6|JnF&!2h-Qw?5b5;(Gkg#s3ZXZ^Zus{NIRw>*L48P557k|A0aU zk%rXuZ7OX>dI$dRQm~cWgNu9dzgQveL;6?z-;e(#_j6EKH!{xjk>7&?{}QH_OA;rJu~H> z?-tBY>|Fbgi#}50y@vG$?iFz7scnF=^fH9aleiTuti_JAb(OtEcYXQ8oY1N6t(+GkeU9kAKv; z`>21eI-~BYL(aMP;D2veT;1|j>Es!|yjnWrpsaTu9rx8SkKteYs`He!Pj7zy+T0m0 z&X|AS+_5=tfBErenTK5CD|oeT?!&J?Qf(i|AC=7Z|4m`I)Bdg(I>2^yDEL)lcR&b+qrM+H3i2V z@!9B250tFh0JX*Z1P{@)!(-$;76-p`K5Pg)|Mr2g%U8wG1IKABJ_pBXcUv4fug1Y|jw9D6 zaqySH3CH56J5Ils$HD(2j$9iD#im~nhyS66#xDP09Qt?1p|duQyjgJ0vE&^Tr~LRh z{oWI&{K0YfIW>+RIPb*b=chRM6XW0)#u+a=Gg*VZ`YcgE@W9dY*{JArAi0ICS>L;pfCS{rg!QdwvUwiPgV(aqxG>;peD0 zdiyp``LS{Mc_)rPxjat!Yvbsxe;hiO#<9cW;_!cV9Q@cg{azAhynGf%pFhOuugl}~ z?~FL|J{m_qH^kvH4g93y-}$MH)9$%({O$X3_`e{I-bR9-RA-}r6VWfR@Mpy7uao2G z`Jp&?R~&v;q3u}qe@q-bB*gKPcORx{CuzyrlD}jM=~BeEru{>-d#ILny+>e1pCU8X z!=Im|9jmpiFBXVYiPr8{cH4NKM`2`gCGafYxv_WoJHY(O+M!y__p&_aKYm_Q?RH-7 z5pXlcAwTts&l<(2RsN4IO*;(yx6P3JLyh9|B=Dr)a*V{^qf7iTpmUgZjMk|5cdPRA zp&#<8ZI*mCD*WjvPdXhHlFmeh9}asK_^Ty;xWZqJei!m?kaP+ZKKDrW*P&YKryha1 zSP4)HI~3&=KMktwOCO@p}r4M@-F$oBQS3(d=>OZ`>fd{%Qq@~0D7i9w5ond zQFsk>l4H<62K;l}6(~DLcZkm)pa-7_u6IuCMQUIt{O+FPkIERLeY0Cxtw+~0{US;wyFL?b@6G&c%i&$ zO8*M29f1NYzr<1gZ@`n!v`Q)0{feI+^ds?3Km3l>UrRiK^wVnGnMz(w`G>Pq!}lxw zIQ#ue;Kew7PSW{SmA@B?qWvc;`?vhLhJD0KYRYD1<#F#YT`Q>?TVGjKnqONzy|#EJ z!0~0?tm;|bvIcMNtnzBDFn98V!qT$Zvgws|-m==s6SAtRXO&GZo>or1I8e{9r%gbt`#dgh+EU8Cy-luoQ 
zR?aG}y;k;VoF1K2=9Tp`Yo~*v66&(rN^hBx3ITwCa^h3Iv;B{?~xORB2t z%0!LHe?tCfS*6Yk)0`Qvl2%way`iD7rmVKEdRB2&rT5yx*=fJM%~91DK{@#sk)M%; zxnQ0{YG!q5nV?ZUvZmgfQ-W*3(Qgn!6z!#~5l~!RWlVitG|4-D3A9U!2Cy{mk`^M|)DyNGMApz*;zot-E52LQlnK=`dQPNOc zSW{nHRyez|)>~g(1)8N*`83JNliA_QCgUrURz1nf9`sh!R?jJ{Dw{RkTTxh6TU%W# z6@-ytWAeZ2BvPkc{Ra)B)GRo3DNb=!RdosUTUgdmQdZ-wtez!UOh2ozkfwo=Vh!I| zLu_JHYE?RSa^d8(!kkHG73NF1Pnn z)U{wN#BnW-fmAu#TUJ#yz0bJ(jl=5f+A=uR*=0Et3%!8;k|s5+xU{gOxXvpa<^Q0q zvrDRH))d#4MF;F}v1_Y)COJHI)_>)(BUJ3uM*c;!qZ7yfGK6LSqdz%@X#%6`%F1R; z*|m`Q+3D{Gl+*Vu3G)sb`&a%!@(J~PVr`R~WIz4mW3k%%_`Yo~{~c|u`r zSxr@O2@Gr|RVRnSfAB1$#7vV9y+&0RnQPMvE&ude3`(~5-+F6C8`1fOOwXF5SjTk2 zKtja~nA4nmo^Y1jU z$Q(VUv|3KlDkQLXa#Vx=$)!-Os2S1fenKKkEZY?G+uO-bqlpWJmDiT_o-yp4|3>p< zUzNZad&_brijj$Vq6A*p(ewWWe^UMbBP)nL87WM>XwoP~s0dw0mvdzzhqc`;{6_gk z*V^Zu>Z-b&N-nXcmy{IN$rV=Nlv#5yH6eVjtDjlM%n}tj7uFys^OhHuRLs!oFeR00 z<+M1hoO1^H03lJy4CWS=7gttkwPmEEVV=ZfslgW^uCV5Jtk){NS~*>wHnVJI$;=v& z)Lg5Ti}+ZZscO}g6wfLLsp3+cBblL<&tYV*p{I*WwdrMEZ{^Ig>hjWJED{RqZ0v#N z%<9=?z>-lrnq6I0 z&(P)Ux-xHlO;(=lvls@AkAqO25`+N4Xy=HwNQ z%gvjdGqG@F!^pxKSk4HnoGyXvrIoV^>+8x&dzTvBFq#a(b9c08xjI5o>Jpby-eje!d9*V>FST##y5hYpv_lVlDT77{X$1nH1GARFgHc zl-%T#ms;T@RS%qv%yICNk;PgsQFUIx zET$*tMuZyKOUrhN($8{M9aT|QRTF<4jH<1lb#e8qNoD9M?$~7I$+f+*uo%o+MjDq{ z6&+&%V-<`l7u81g${5kBoXjd5 zF?>|yDm{8RV)*C?8ckwvxgSoECgN`Z(gETs5$ARLvS9DwpxNw&gLxoiehcc9j#;A=$r1GL==_71%hcUsYRM}ZhVlyiaNm+;`qNdFrj>{*`DNY5XTQ#} z;XBoRG2(e{*!o!aJ&C`~LL%){_d|*2d1LEi-KQg-XZNg+b$^riJ1rzq>zn}b)N|yc z?oSc_bOZ@qsQWU+x7zU5{Yv5&+wj)?ed6cX@F{IlKe(+A-A6yEHat2s@)=^o%aI_n z(rkF^ECO@J*zj!I`gm-3b?ZuG={7uhvOf7Xyt*|nvI=Z?bbRDfWW(dALF7|m!_zBR zpBfup-SQJz4K_RsG4g4&;c<&D@>yuZ`jj;+sP8y>gxBA+ETJnth~pQScD zZrw#bEjGM$50aU!Hav!02 z{yZCATN}0iKicp~Hv9!Pe6kJivEfr}_)Hr<)rKEy!w<3HvuyY@8$R2HA7jJk*zg`3 zew+=j+wkLU_!&lhw`8Ir|4PRixUv0w|+3+)L z_zD}o#fH~?Unc3n0eAaP(sX~DcYyKc0h*@!)+epA6~blD=o?X_=aeRVyH51rim>3+ z?CinklNMPfdUs%9$Mk{*Ew1II~T_I|(y%Gg}0_gD^uj zbBTc8C(KaIY!>hq!VJ;OMghMaBC}b*6A7P6xKY3t5@rZw)(Chk;UR>J1bhzR 
zp@j1VJc2Mo9n&M=(+D%fG1CNmGGT@`W~zXXA3`0l!a}A&J>6;4Oq1ikOW8 zeuXeY5VJ4gr5hm{Yjf zCg7cfIdz*Y0^UKGQ?|K8!0!|0RBbj3cne`p#b%>`Um?sX*sKxoTEd)q%_0G>Aj~P( z%op&pggMok9sxf^m{Y8oCg4X1b80nH1$-Z2PN`tymlNg`YBmdaBH?nvjRL-qFsDqj zM!;hUa{@Ap1bhzRO2YX99zpnO!X5!XJMSc|ycw|gOqmq;Sr7c6FZgmR<|sYzMX4T4 zJcaH=4>Y+q0hiupyn9jtvkX_Arup0IFVcfAyW2$Zoq><^;DJ5Kz>d~~O%m&CZ6oda z$*gnweXJ8$SAPtl3AYjQjw^Sa*u*m0O3sve=w&w#AREKC0tntJ+AY;b4ibz6w$`7c zFIZ3hu9&)xRHnoI2G(_L;`NLh1y3*WdbMA*^*2J_KCEe7j~PF~YOMzJz&hiVcPZO; z<79aA@@CHas~6zVw9yaVZN)-_2VsJ%d2pHYzqu2=UOwF7H~p*Jz#CWTfgQT<)9(Dq zx$SLy0zz+J!v_-d_I92(H}-(k_Ye`Ix^)>g)TZcXEJdRG_8zJSKAU}PD zjpygybP^w})F5TW}n(CP-L)5?Wc2hRfE0_!5N5n|(b$x|;Y}l1jGBxkK6~#ze zN8w9WwEEZA^O~#aNfe!5=IeRI)pVt(TP_;@S8L5I)E&FxW+bgw;!AOle)jcjcQsug z3Lh;Bf85&2hK_7~9m(87eLXu`pTw2RkMW>U_tO4-J#V<0-bD@9ZJDCpo2{=wE#41T ze2JuWWmH90`CnX3H;eL@it_tfx5Dzg|7h(-GWQ8zPfzQ5T-Bc?`*%Ul?=Wk*no3c4 z{)h!VA7M1Kc0l{Cra9=?1wGq|Xc35VAX?voF4sl-dqGbM>bEWw1#(cp*R!JaNtCI7 zn+<5KX~?ma_w~Hf+6<;$O(~-AyRA0>s$aTG*`u%L!`38`zZc`r*Yi>9570{e!f5`c z)_0&;SJPIJzq$2gLjF=vLmBk%n$~O3JbG2+uWh}OkUu+;|9a~PQGc<>?`TaG_1%&D zU9HJNm5k~`pe;AldqVXd1Yig=z(w2x9EY-jl1^3Z3o)*Kn`YvL^sl8 zJ@9@Wax&1U9z4!X@?B53PHe=0(lT8yCb}0M7{T9W^xG}`yT50W_hdcrvvKDx68uOH ztX-|O!{2`gGOfJ2f_99x-@}*~^F4t)9`CEj>*;WXySvsH51kNfZzyx=8|pw!#;2D(6f`IU5A2K60*&$k`f|4^93!H&#D= z0_n^d7{x{a(O?=WRBCGD(kd%|Y+c*-HDe|uneFeo0?^7ziUiEKqs&-K@ z`}-uB`&Nx*OBw52<-WwXDK#K2_286F;~>=S&BGpaM6PdNvddo&g@Nu9q}!_(T}@SxOG=nz zmvBO;J~=P&l`--gA>jm=OG+Oh??D67FQjkF^j*+1cc?YrNk-na#5Jl=GBbD}1xse`${!`H(iJ^i7BTP#U4wyq8O19+&XaBZt$PNCf)3~vZ zmTG{Rknw$e5G36(#M%kIrC875Dk7HA`jr%A3A+H}DKGF1eQ4&?%&RgB&HKnB<6FV2 zb5tR+LU3x0EOWdBj2~#P!M=r`$H<8keo_L)rZ|N^uv30rQT}V3!s)a;<;YCV&c+)k z+03z+7n+`o!RY%kB`@%Y7*mj|=^p3WiVc6sa}ghs2~qwH?c`5m>JW`pao23)cRH9@v=|_}W-a8^UCi z7yNzK6Vkph?@ZAaPD<{&N5XPmJPHMk1?Zk;%%>}O<6ig{4Q7v(mIrWNsMMWildZ`; z2H5mAPH;)0U-u$v+XHjnn4Or06;if4H81dq9>{hl2anKw>k>mZ_UzlYZ_k#KI!bkG^t-;jqb{RUt6R5w)M{&`U$3FsOU;}lI!M=s6WimuBJS+ z0H>{o=DJf|epBf6N_PrpXX29e_O_PS?0{FPDxBE)CFw#f 
zqAl}RF;}3zsLJ{IAgeYbk!+NUh9b{I>)xQ5W1%S5i`nipq0NP8!dxb(VSOOulbmGp zcodRSVP0jgO6i^_C79o>=zC*4L|vT=z1(_kQYsNOQv=`Vfyod%kfbli6CQZrvJTqS0gc1K>$1|+IVQ)$o zAENrda`^|)e1qp2FBlYprG!$`!p#kt+KLzPCD}iPv5#PNyI~@t6BP9jna+6Edk9(^ zWTb;87#IY%aS{zehm{>Cb|75-!`Y5+eF35tInGyb2R(dpWFC7Al~+^|mx@MZ+|vwP zcsPoP8Aame61Nq>g9N7vR~{osbTvH&V)XvrDtdnQYuguK923@36Co-4)0_fhip|g9 z82o54`1umOAATGBh}a4HA+RHZ77=4IXrSp@E_((5<3W@(js{tqKG6eE=>TH~xFod| zVvNwRRNtTn>p$-LRPt@bU$D~!UmB3fW}i)NYzM(7`avhpo$P9=AAlDVj)!}G2W}rx zE}dwN9;$K|!8A5vU}AK34HoMWc*rFETspg^Oh^YC$)R)octt5lhWWM~;|^gnUiT0h zPJa3p)W$NS%AHTsAHq8b#-W1u$tVJ==d;aK#cU*>=3XSsT|Nyxt6Z9Ubt<1UC@bCQ zNf|Dz9riSY28|lo?gAQgHf-6Q?jng6 z+hMj{Xn>yiZ4}K#G_41FSz~NN%ncksQy631z%faK6u|-by_J014H7fmmC!7Qs4&MF ztbq|WL=Rh*#=&YcOgTFh=_{gkS(D7c8^})*VsP4Vj8bF0Q?4VFTzo;BG(i7+gt)oB z4M}+!RrE6-KySI&lj!4UhJ5OffkQJQg>W>FqkE+k4o4G?IvC64SWZR{)IbJ3SvZ@% zA_S}4Zg#{(n544C?Job7w9-Xy0YhcFU z+{nqV>(#E!MknYRSKzC;?|y_>|5Vr}Lrfp(ZNc$Hq5NNcJAQTf&!r>W+dnt351a5D zxb5Q$I&t>`X+@h%YnC9*432YW2CquR@Je(oTX%jB9oKCSqDJUE+%7@I62rI&OkmjM z2HJB2&-&4GTy5oz(_KyFiO2wQN(7njYFZEE>gz!&C;h9O^iPewPz>xb z(cd<&BrkwC8apojZS%5oOEzF{BsVnd=BT{JjO`3zzG?f;bsGNvJfJ`J{3EIE8rC*8 zko|&&MR-DX#;*AtnZET2nF$|epIhoqtbZyG&Lt@)IIDqDhQS8Jy(%^Hjju3^Lg1vV zQ1%%brq`Unthov830nhm-NP~i^M<*W?ad5~8$zAvB|XO3onm%@>|?gmQk-6TZz$_()Iq5%-63u>G>e=$DDT zm2n}r=ymRaI_DUeriUcFlAU3CFCH5jG*Bo4oIUi1+>*_C zC7WgY(mJ*L>11FO=xIFf=&j!qBKCxf$K?->&3L*In!NjPE?I zyEbMfeed#fXFq+jiZ6qs76wM$qWji>>-pcBv&BU0(N>&{w!`@WYkvI{W{7Q=2P#g& zjMaOquNxg8C)9tUPIMb4{R58icH`)e;AGfIWFPCRk>{yJ!?QqQ(NJNVOlh#mu4Pxc zhY4%dOMW(1e8^d3p02_CJg%mlu$}a_uIDn_$EA8e6seo(Y7O73xRrpZFOis){1S&6 zIQIEsJC1$6XbWkLj@0boy-&EIql9#Qw|7QD8^{;C$>ym9FwLih8=w|es}_4^+l>Pm zZ9up*MAgZ1nooq#PG`M@xB@U0UJ}vkt-v|gKn=w4gom6M{*2K_Jst*q%4N-S`z%hh zTO=o+iFML;86WiKeUcw(3qsd2! 
za|nMs$vYN+*Ath)p4%6ZvPd=Aqc#EV712PiCKo|DPASu3GEizu+8czv_39{K-aeLikxV9{ki^?%PJ;(UH$oP<^1aCe@ zUv51Mh={$@GBe~$T6A_kc+VdfRQoA4z6TpZ2}4I_71`Ph)Scl z8RBK`nlJhruf8QHjpBy<#@xV0;~3ST2PL8nz6BR)HOL&}2-P3~*Ntt)9xCS?>a->CyZ{m;GXfWCnf~lZh20_w;=(`y-}{4 zUSUH97JGW&y$K=r#^}ycZty&}F;aBG%kCl+TQLKA3Qxorm6aEGT40T4OnjCMEM{E7 zs2Y7i01<4S*ax}~gv5SPC8j9n6mSM4b1q?yCJq$ri?0E|h5rJioN_-$*P7p>Nb~Au z%ss|C;6!Ytu=N(@z=Z!4`um--k!F>%gsI5+^=-*NbzBxke8xz){? zBeIbn*y{4{!@X$iKX8h%g-o&cH%Eabr{Jl?IRqaBT=ew~63lgYgIkh-GceFg|A7h5jaxi925OI=GY_ z>&7t50ek+R**IvmAq{V(1b^m81Ev!g<2WpE^Ma3w*$y|#jA2`lxt!eNGhaXbR=0s- zdd4Q#%{P*DRS%~yc&F>ZH^{>Cf|S*WaqZm@v(4q_`K=Yl<11W+W~Jet54QB+B1u$L zlsUg=4tn-gkc^meC)V!L1E9B?nAgwyVp z5(mxM2izjy!pnh_+|{GEO&^+ww2if+$Jo~q>H9ovA;8&s!X$+GgY#X!aR#M=&Xxch z?nF;wiEeC#FAp+0gtKLa@w@GkFagz*Ex42tUydGOPc}Ua!cK9%LZ(xk_lQGvyEq$w z3%?4a6z2)BW!?tEZFUm8KL){tF$j7|@b(D7DI{0|#nT$3n|ax-A=~J}W`gc}*)1YO zmp>0~n_v08ow=V3$rO{i86<+*iu=oRoNT6h|UIC8{%MR|n5 zp2QH;z6JTj#h*SWz;utou(lS)uBGrxCQNr45 zHg>!YwSs#s{0>UdRBMFwXS?;hkb46%z}*+d&qQ)7W}~a=T+B>jf?0#@>~M+X-#AF| zj;Fk}aE-6zx5rk{QK?>t~jHSE~*nm?d2j>be z`Mg+x5Wa{~!boNQH^!^Y(pz~zd&RMo<`zpD<93KiZu;Qp@$N zPtMEu0(Ejc`wec4;-`+08mq^dlHaM4)m20v$cDM(jlI z2$w0t>0l8!6A$I>ungyxkD^A;9Ah(ej@Ah%!^y{`G@Ll| z8@u7m0WLfcG^H{M&^P8!#CC<-*{3pXfMW_JJ{8(!DT%vi0wf>qf8?9&;;Z%hNdw6~+j2 zVuos&%~8R#V^Xir|EsTU=R*poI?p4{q0S+|g$IL_ROdnHX0wjI6wP*`Y}kXoHXivP ziZMnjMwm@Tx|xaz^7I&tv=isJN%b6Y4n`gVF1!~=$w(vAV(#P?6nX*2E1+B*+l#5# zzZ3ld^9+|O+Z+WM$nO#=t^$e^Zu)DiyMxwDhaBS!u^5uYteGyN!L*B&khMP3H z=4wjhH6-jnG10Zy7ABlbJQ^8dihG}-TiqQ3^L}_c$1f$r;fUimNwzpXW%P3eSi-@* z&xCQZO85tqCna%bGxfsYpibVdPBH!oc7!M2;A)cR6ehC_H;@Bs#58Z501eUT-+6^f zl5RjhcdmN~@Ba!MMmdcxFQe0S(?Jj+FNh@?8HKLc`ttYCx?FmY<($CB;Q9}n$>VvS zi@D>y6&j|I@ac%YjE|H!ZJ-ohY-xER2E3(Z<{0e=n#m3UvXhh<*Z_`_!BGxxDW##Q zyx=hRMZUQh94>#D4oPV+v$YL$5i*DO+VuGLMwr5I_*-W{S4An6ELckUHUoNINcP<` z!ix03CVfwb9?I`AJSZzI>+^P-zO2r~?Bd_{6WW>oWAywBX{S=!B^(0}ifsakG13`Z zK}u%RBGQNmy=j5rzx&%Z5G8tVi*bhv?aa*4VJ-GK#t(-Ji3 z)LRgQoqC%?oI`IVz=fwqit$VVPMQPJ|H9$o&E 
z;G#@^3Gq;sJrDV1*Si#L%l;Xc`M-wqGx>#LT_3YvDw|ZBUb1zU|0ojX7sp`{S9;j` z!Liww^Jo8mAq3N%IwG+PB$vH-tGL5mhh zwhG+G0{jw92rFCUZegl)b{7lqONgQ}U~e6kG74CLUsNg|gl0Mo>FO0YQ)6scBQZR4 zKoX6rjb>XLCCD@R#R48xAm3JCEDP|91yWUkG+Tj_Sb$$FutXR>#9*q#;1kv2v1-AC z%77TnyySeg9bud)z8+=K2l@3EMX*SOorm=N3++zuUc;T*euXa1CXN`i1ccVs#NgOT}7h#`!#P zdeaSM3H=7a`U>9Q2@gY5AcLBhgeLE1({VROMag)(q9H%)%FR(c=Cjc#14T*uV04@w ze1WRT3;Bip@a$q>d%FO@h-O^IGNF(#pImUoF2k|GHnF~qbY_B(Fm7j#g^qIk=*;DZ zSQeCtqtY~1!I2*(wIiSjb`cn^~`j9uVD7RB8a|0?KIK7%$JL`Ba|K$G)i=aLcG z+y4d+;tWgp3b#v;cYeZDXeI4e##l!jY2(Vq?t+_CIkC& zWROKa{2|C0UqP*8{_zz^mF4b*gD{#fSQNrYVGtU}W(Tpu9Q%b!iE|u%SOlE&=tCoM zH-JAWaIxa$Xz(J=edxidn{3BXbMev`Jyf>!3hU7(oW1X}-#m!2q-=gUynw!|u^CvfVxWCo(H|RiHqD0U zSk@V-Vy!RQhHsp<-_{>Pb+-O5&S*j5-EspTcRUJV*<25wLUHwnlcQ(@G%74$!3+>(l^VKX|81*`kEd6jDu}@ z>j;UKMNl}YPs?|#-=6~Cd`Y(aM~Iyh%o)aKY;%JzkQUBT<_tmelxRY50v=YlTDw|4 zG;_5!p`b7Lvg`x)%Zjd8`7x@5)ILf;SK;*zN(l#` zhNb=?w+SUILB4TupO$Z3w?8Gk_M%k62HCP3>hIe!-F~FyTT#Pmxj?qOlr6u7(bHSz z!}r_rWYzK=zq#eZZ7ny-mRs2J<$YRSjAJPAN6z~%NSUXi<;>ueqRe1UX++x-3Y6J} zVHQ!T??D|)>K55{E8G4E#@$=$txWhd>K3s$?*0+0<;#y?rKJm zt{-tSP=-e|AcI^iG(KDU@{*x3=@WJ zt^YzV5T*0r(#evng#99kkS*^;R^wRqjsw}fl57ph-r0*RQ6QUYy!ua}5()c75+UmX zS#XuSTj`?R|JL_%#xt@7wN+`zGLonV->SvX38ti_uxw1_&48$O7C4YYEBFGh6**&3 zgP19r4nq$*_sDfN8@ziAi_2OvfJ!4CLCn^c$PHIPpUPs{2;Y~fZWitNlY8Zzi#*H- zO`VwL!Avvs5PqEIQj3wL5XJ$FDS{#s#91av{3j^P842A3qRvR@M&j5Lw$1Wcz=aP4 zQVMhlhXVvU5CX}FP)>PjP0)cIZMnoS-vA5p+zv*z{W;9z*h&>~(dRIeVzm4{xDJnq zwEVVGWoQ}pPr3y0CJ^bno&7SUEg=2q8Rw;>wt#wm*K zhyMV5CGshV-(@gHo{VRsdmzR*1`lyoH0Pkpom{;`oTHc50_W`IXNX(OULFd~S&Zli zPmGba=Gz$y$7y~gG~;akGB)opiY($B%083Lr$ZmoD3W0#RQ9H)q3qAGwBWk&NZRTX z)*{^a1=DPUbi1y6Ci>h(#pX@qVTS}aRwMD>%$3*O)UbS=v zSqpyG*IrYpv!3YE#kh-cXB?f~j9z!@?DsJkbH!lnXVeXkjxhECR7e#p`a2l=?prd} z2#r~@Xt;)aO}Fv&^Gosh`UcWD`3jO(hbhe=4-SV>Lfl``viVWB7=*F)e_jl7Lt>B{ zOx!ahcRR>Anc7W89lf}NI7ctO23(lO3#EZAgFdOO=}$pf{$H$}ivoZv_=eB$VrWYi zU4)wQ>Kx?e1|Q(Q_=*%3>W&y=H40e=D2=r!N8$sK%9@&}DV9r_0kL>8Uxn#7g&9i5 z9U4B8j5{=(KwK+TGWVOlR6cL+5A*wgew-;}Ld=DLijrC+O621*R 
z&wDDjy<|zs)5skRs<0YD3EuX_E|$FQn_`K_ooWUWQ^)~XI^Xa=>N(hU4S!2=Bv+Gei7cwek9385j8M3j9CMMJ<|3tc-6?F4Y@ zG_--lISg$#XoOu6Lwgt6Hhts(%L_=)Kn5}zv`5D}seTuts-#w%3m*;yzA?lpRe0)X%OqP%_0(^s!W)QQilLw#; z?k7a7letC;j6~kGBM60wB%-iC!**z$O}zc<)YzBEbk2lZiF0UdJ#gWfk(uyuFlRPm zGUAn2F2Y6VN_8Kt;d!E8AUYdka5-W(sa|5B zUXeuD{Ru>bdcTB8JJlP4EKc=aOPoW!Wx$0OMbtYE*2Gv~-@|HUE20glfS63wQ<9h? z)9*zzYx2v<+qbCWK86TVehHJ+T`XnYNR#dh?1dECBt zw?tjmgg|Fr;M-_N%zi|wIOOlP`U2wftpAs=?YzJ~_Cq>D)!-_Yplb^3jjq92U-SW; zI2~IKcsAEUXsI*SiB`a_I(5P?Iwa8|Jyau|twb1%AJG^y>4BI;dOQiOIhFYsagKeLC!)-$=y&t*2%mkMbNTsJe(`Gl)$=%CpN=SQbu-yL30IW$XrwfjHy*Nb z4pcc8pfNM)Cm{)aO7t)f-l_v=#$ubemb zE#dh1z*SqJ%T@?uKz2ww26|5J-be%wu_V6u%DR*C2i!$gh6< z5(l34lI>mm`kG%Rzdps6Y_(aMk4UUmN3vD%^)d^;#II-h^$&h6<(K17j}RbVk`jqU z%J>n5`6XLRlSM>g71_Wd{E|hIWf75BMIK=heo+ngsdRRvN{y@{601rztMH3e&P3{n zY;;C2Dfc5EwGuN1fm>dUVculJm4Nej#POM`DPd|V<;-)$u z()^{XubG_lAaDh2B)S84Dsc`oIUKm~5kSgevWvF&5a>z$$hQ3=v4p)5rfc%c{yvt; zQ}od7i_sijr1!IUkSKjO?p+n>!PT9>I^WP@%zyxT2nS~6`|3R5#ZbkHm+{3j;%UZT zIkknn$p1#gSJATaXw%=IPM)$}0~6PSE7^)33brDV&oNO>BhhLm+y>Ht4aPcpK}GjK z5w|(%UaIJt2!x|_mqzFUtr<^?zC_iqA?ePr(Orf(k>?jJ*(AR|{#Ek(Pw*SeN{x`l zE88T)!1FkNaxlw{WN7EJpnQD zx5CbGe{d6WSYV30U25#0Mc}4rxEZ{OlU?X2<7qMCU~W89$MuPjTV!XuJ+cGGfJbrG z>vpM6a$@{`v8BKa?5pDL{WOab!qF!zLhx94bXC#m{)?(>pH)y&*B^4JPhN(mQv*^u9!2I_2F8oKxQQ#9c^wUBB*+zx!j* zTNs0$m$*XG`zz=fQvk}5&}&3Jcn>XZyD=NR*R{RtLp50Nm@sECyh`lDuFkIZt_@x5 zy4G~9=DATG^HmcTU!^QOBHsb~vzWCSFl%w=q1UWcfhUGuU_+R-TXH*a68?s6PF~O1Ca(#BA1B~9YdD1^#hSu0tvT!j z3;%XifOdDTdB@)D&d4*!Qp0fDma}-dD^#<8^~0qw`s} z*s5LMV0#rujB}_@Q!K}Px)?a82u;NCB`^PBKD#moJv|1!QN#sFZ~ybzpcwS_!W^9Z zeM;Poq__Y1>}8a5%DXfMy+y>`NqYOA&#Ge3D~Lfao49|F-u~ya<6_WDib3yt^m+I` z(%b)h_Aiuk%G=7ij+<+b5!XU`oX_BqxD14w7XAo7h%p)Ya;(r>V}dz?W0;N(1L`s^ z0s2fZ58o~)f`g<76$C@UO`vO=V17X%Nq4cNyOIN0(d}Wt)`xDcqPs_W2t~IgLYEVa z@mPfJCP}x%M)%2>bich#^81;cZm(&@xHv*pt#uBuQN1)K)xRpLZ%3)}HP&KL!Y|_| z6p$L=90JWQ$HLdrz^{mi`)XjkqS+p$$ya6ACJ*DW2vyD)pju+1`eaP1--e|8FN3OB z2a1J@TIvL!6MB|2g>62FxunTY-<&yxXF!cZV7igXfZ+$->sGo~)y2ofWTjlOGDiy7 
zK|M_jvKH!x)q>`Z2$}81+X%3vp2UJi`Ro%(2B&3bgA5)2GT1B~|LZ?lq=e(w1tD_w zkqYDNj{p5zF@hY9e>Mm^N7XdooTKVu;`VS<<)BH%cTP&fVo*wnK`D_qzAeT4`NzKe zd`=!6UVIC1PJY6~ts|8eVo+HUgUYQjs5B7A*Jp{HC8tD}$DlGU29*)Sy+JBNWAHLK z1{Ey^l`mlx;jN_d$zH_a{|4w}XftsR<-EXZ4j1?^aob61F-SS7_+n70jzQ%L;=U!7 zi7}|0AA`!!7*viVj;}j02Z4&PY_1rh;Yju^Kx1Wb9PENCi>_^5?{&Q^3yPpht}ov0 zdY>zcu2o$tyOwvo)b&Ewb6x-JTGsUsYkeCW@5YF@?PH8EEVCWgJisSUiB`6y6$3Gn7Toc_)W<`j$z4Ldg{@DpBN z8_XG!9r!5IwQOy6LTBi_{zEs)^J#%Kd}(lIa8MvL_yD}#He>HD-F4cUU74;wCpPbL z6>O^iFw?gdFPz<$lks-_a$sKB|lK=A1k1`6>jDzjIK|u$0~)y+|bDjKFP-sGOsFj8;GUYF%C+4Nrt6 z;r2v|-kz0=`r5{<6il*Nc%ens7~s+*ZVaBq$jaw+=AL(vJp|bW8?y?Cu8~C=G{`s* zc{=hM^mgA8;_=z&Tgn8OHm3OG`hLZWQL-xZkjUET;|$CKolNoB=;I;)eE1eK35f#8 z7sqZzKDWGBBaJC919+LDD9V}}eLN?|{Ct(a(N`ca0pC_xj08|=uY_0}&=4jeQ529a zp$3^(A)&=g+I=-h_}S=VHZjc#gNh4T223N_=wk)ux2b$cD==UfDT*0PAbTXA2x2sJ zIzOPeRTkyXW^MHGoB63Kf1{7DJrvT&Vg!+&AIUEe`QTb%Krw}7ASx2bpz`@RH9z25 zV4wq}D8>Sy*b>Qa75U&=VL-8hWkRAR`cLFT6%xtfD6&*S%%o5f60b5sPe^D8Lo>*Z zL~_DxA+5rYca{nHbR>{n<$F~AMu8`No-!lvOdxwCp9o@_6$V8sSO&Z!f$S>3Mdfew zwF(TRL5gAoQNJUSzg6UeYlcrG4pC)M=Ag%jM~zgK53U6!;M*jNu>kTtk$hd`xBH3| z1{C+oqO7^mS0VDD3Zzs86G$u}W&$dZ&|(QKlu#=Z$c}^`NXsi=S`>!7vrNdh6$xZl z`5h{Mqrj8?CRuDRlXhQsB%cW88%SMEjRZozA@E8lm5PKPXn6Uc^&mM!KX@gCZ4~t$09H zTYf2MU~%$0UKjcd2(F|3R5gPl@__kslS@Sb)r2%-GnhyDLqW7=k3iQg;nm zN&$)uLV2mUG%k`P^JN8HRS+7xhAf>f^J`>&h05p5v0Nuh2h03snZFQ;jis-BlBF^V zElW^65tgcjmX-pgSZcwg(TZBAK%WUI=^a@?u+%`77R!8kP-;Qtiw>I4{07x(If&3r zqL7Xx%2&H@3}Dry2bV^UBuN*F(iKS9B)e{Th$K&u`E;SkU#Rj=VtymFFhJ(hg(80` zlFU{-nV~Je6SZM6pWP@ehFwn4WI6gsJ6D?kD|6|xiTYzyeuOXM62%n_4eij0q8#0&ovQ-CimM`A8b?d2bg(E* zugO-a&E=1fs~;B#uIXTrPglwOKQh0AT)ijr>0pshAIbbESId#l(M9v7^k^I9=pb1R zy{#>OP?)j-lwzzKmqw!`oGL4%Ac+v}xdsgv$$XE@AEWXokYMs6;OZPbZ9gdfsUN7`GrZ;?g+&hqU{W&>PCUlaacb4#9U0ULl$^0l!j_ zx>`7+hv)H>Ndx>_0e=3e2*F}SJ`SMpg_ugsX|vTc_6Q(7Nt9@#Cm9~LC>8J!38w+B z@Br2&oDaC62ylgjYc#`{j}8s~X-IBpUMgvmv*rJyMqiUTI&wVB;cwe4-@=^dk)wxl zQgPeA4R)W|ep4gq;gc8Gog27mA+C&v&f?><^NMmadhm*mhi_mr^E|m3zXG~TLaDhK 
zzX0+J=#P5F&j1<#aGIz=Dphe0F|kyk6fs-$1NJ`k!2h2rK7`JxhVYjZ@&a@nMGx;H zl+Iob<+%4N^3$X3B0n?NF7gryDUt6KkP_JsfFj?3REqp4F?~f2ANqgN=j}Ok6_{_? z!IU={WQ|KC$5=(aI&{3AfO%6ad0OuR-t-`<1n0q+Ql6*V$!VFCiuCivm)z{tW2 z1Ha-oi`L=p*I?{+;IMlSUPaT-_>zj@t7!Pkm;rwym~u6}hTgz0DfzeIt^CR4liMsi z-$K@H%K`Ci@weTN3%vyS+|x*jgD$YDwI8nFm7leV3fTci3~G-v4j+ zUDl9^q_Kg-CvgNbAt&RD`3o}ni%tpry`K!Pd$7wtz6X0|a1MCQH9AP1*}9|w+c8t% zKwoG?hHKdrOdmJs_+2FNW`Z~B#cr%Wh&*{m;8Iwuta&7A;^%x`(@w$=?fr8;w?k(6 zb3O&QivFC>G%u>)ZrFvh!3g8waQ@vExs7vTFp_UEq6>s(ErkQK%O;maee+=9Bc zn{KD0u=&ZamOZ7nnVtJ+t#jR};S@Az{B)B7jb;(QAcOmMY4Wc7t>n(Jn>++9I5#K< z69@V1Pupt1g?GUMZR~tNMM3ICklxG#TkI(`D1OdW{gP)2+T>ngH_94+Cj&_&9M$%d zz)8JNAejLqgYrvQFUHTy-AaK2>Q{&612En}Njbj6Sp^I%J=XcUF*vEvRg22dRo} z?lF)|YfLiz6q)NVF@j7tRAO8lA!A4~XV}PG6O+tJXpG!sNp2Xr8@~f-A!9h@9>LA$ zT$CwpPK!y#tH?Ypxmipyo;r(+N0NESM&=U{gGTizb#sOyGeB~)gk*k<2vs4;6xhhz zACt`IXqiG?CCMx$nP($p7E3aR*~mlId9f{kB$qU4ZD(l z>uvNc-C(HNdh-BYR6Qp0>m)A!Sel@3y^bG%j``k!$3V~7ZZq3(+$5Sy+$a$1x+x}| zD`U{nW6&8z-1(qmp4+p(^Js%&(Ax_$aq{&kap#a;7aVeY{l1KHPC1vxptp!PZd90m z-7meW81xEa(90%n9O+%OUwX&IpqCVb-uLL}a2DzP%(-~DbwA5#Pya$Wr@XDK>*&|V zh|49tzk{BW%1tq-)W)E46>)i_66*m8PYf!=GC9~ztO1LTUTj5V z1Xoi!uGrx59dO2Qd_i!S%|i?FNH!;|Y;i!f=8VthT`_MB|(jv-A$S=`u= zyJJXGxk-tKiLraF|`-()TeCN7q?RTz%HQ%`o^P~muqUEQ5<=-;@1oIU%7WH@kkF_^} zkD}PxhkKGVge}Qc)Ch>91`R4xQ6VC%EGZiVN;NMgSoalnytFIJaYs@nCM5Q03P&mQU;SenXt;;VyN8B5SaZ`r%(BJ4 z88CZ*xsov65X+j%XXT>ed!xq0tT4Y7!F%Vv`sY+v!)Rz`dB(jnI^tx%v0>;l((&$J zY2hVJpzCf;kAo8H{(u9u@5X_3?bMD(U(ekHtp;f0Lk&%hO0Nae3eo$;Mk`rlemlM|BWNEJU@G9kgttdQ$AUFd3uBuZ(Io@3VN_O_<6MUUy`N zn~b`#I$ZPCzvpm`y2EX}?gZ~6kP5-NiB7Iq{Ou-63x9LJ<@dK~^tVH}Ph|zQap)K5 zZ>_M6y>lH3?d|?{3}AMDa}c%*RqOSjTQuJD0zQX8lr2ktjjR71t}pz7jFx@!{jHDV zVrCm3*xUhpyqr0q8~!+;072pp7x>~f4lOteo+S-BS+wA@w}`s1G-A3GF@iSYOCXdU zjL@p#c4*d#-V6PD*p1SkhO#w61u(l&zMBcy8#Kxs^v^TfO!u+1r$wl&5bfz@3^!O^ zpXr^q)9;v{l!j|hkAh})@t155h+vbUT;r%XivRLprU9`*g_-Q(`{%) zY)_wRA7OkP;A8#7UB?gzuHO_j)uAHo=6x3VcJtl~nBBasglRPIaiD_z(}oT6;ppun 
zaF8*n8NDZMxDvLs#iTnSK1`$jz7bSz2NlDmo2hrlzbQn+pSmy)$CTB&f2mKt}_X{ zlU95Sg4n4H4MOFZAXFTLJt5jZsEEE-^a_-|S??20=&r>8Xc4MaoEo}SJdN&*lxZJp zjm9=r9>PF)>*bgzD?ti4LJBQH7vWA@0raxB138ok?6r-mv2}DKbP-%Oim^?XU_M(* zp$Pd%nf~wM!`l16@KiEH>*#{Xj1jjsQWyf=zh!o?{$06g(21hPEM@)1&MjIKD0Dv5 z`9SA=ovS+U?z}6z_<(!U*OA`Se+7K0an*{7uPL@th@TAUIjdMv$>b&}10yG0xqW`*<{TIHFbP5g0 zNrQ3np+GIR`4_mGoHpQmKi*%hSEC$*?wetZL8B7>_7F6Jr2p?j&|P$04nfZhIr_vS zW9aQBpeALQkz>!RUX(+Ds04lJnHbn>MF$K$X`p#4bpsvs5|}F9_Yew<@j#XwuDCH! zb_a~#fXF5q=O4Z4Ue16Sr%ghC%H~ypfeOtY=O2@n7ZMa|5XaSm(47v)V$t5Sg@6sf z#KJypkWnOU;KrDq5REb<1Tk znEt#D7M#qoV*be0@;M{*G5^scbr*&Nk|qbdr1_{$bG%8@95-G7n!)4Bg0i47B~JSY zV+c|`fWeg2?)V-BjHZ2mDPl@AQ;mYQu7Vj@UPnG%;z4-63@q8j-?=w^73sYZf}?-* zudlgU{%H9HUnxsXtmJ+aN8LmyI zo)E7MKo{2I@>XxhEvdm&q|*|nRc|8@Ur1+JCd!ONwUL-76HN&XOyEZ%i)9k?)i4-8 zsKlWVClscU$h5eZixglj2QW@A!Fvz>_&xGfJaf-CRAoV2%&XQOVI5#_lgBuY18mju zgDWym^Nf#g-1}_fj)<1=@!A$qf6EdYP%Nvp7Wc%d(MV#;F?WodOq|ifti#G;oM+rmYeFCiOxXxwzd|B*sJr@&+LU^(0ZNw2==RpWp zYjvL1F%2BA07uN_xIAOsW4bUV5cKFQ+Ulvy*{AKjgy)XVrP^)eg}iBK=-SXlXj8Pm zv^(l&BULq;1nvL6pzx`7x1g}4^Fw+B3?ZIA<)mNTV(mUy!h0lNRNjicMQ{L#J&_vnm;E<{#$>ov~yhXBydmsiQ{Pw?tYc{QW`d0q`>3cQ zjg=ywK@It+XZ=fAt?AWF?+0A_9F>kG^A(>t(TX`;w~CW%Vz$nWc|Bvc?j3CGle2Yl z+U5-OMVz)d8dd74g?dM`HbFMN*QV8NF0&b*7 zSkj{{>10dVrKh#yM03->$cN8mk^u?wtal01nAG`{>E&3}Gl<$MWrZQCnI(FKTWDgm z1uEfcU8bLC_I8`{PFxrLDO$g>O;e6=Yp&r zQl5#D66%tIwo%JMjRgf1>zZWxQl8euE4UtnL*FYqjR|0aG2BE%SbTgAQ+gQlWhF#w z5x`|*0t5y?hmIN_dC+tcV$zpr=iS7?59pr>bilZalRxGgyX%=gWPyu4oWP+Ik$e`M zXdvAk>~`Xs_=FoyBzWphv>b55iB=)aAMsaP(rYZ~wU+coOL~W%*4`Fv3c>=wZ?2L7 z3G%Ev(QYPnKAiXFk506gsHl6&3LB6Dt}(R8VFkj8m_a9!XK72=oIRA?kM`((BvBV& zEI~hdRuIB>4s}jsaU|>b{b-hO0>4eElJKJ+CRzMQa@2)|;${s~ld~@rel&xARINS7 zEBFzFljAe#O1^x6TYg|4@C>@5b*I7(?Hi4VuC)p8Q{5jq^@zGwMpS%G>SC-Pq%>T| z^@??>2&i-Q1&ckoq>1$puoUrB4lT7W4p5+H2*Fhl(1kpLb^MJYpS2Ehc`U7~BpSsB z@Fah5)oBYMfz^O(QODHe#2k`faK$H@A(eJF0q`U#A_4r--tX z5M}SmmQfZESO@Nl+P6kozzvt$O&t7S%+CZm#L;vZ71(b{AF`wsjNAAVQ5NLsP|xbh 
z@5D9BlC9xTly0XHWdS!FY8BG_5r4HMy~dIjPqFAY8!hl1dRnUxEegT{K(j6983cJY zqAZg-4Gn4jM3f~e{D88;2Bd&%RD(tbVU!gJh_d1=Z5blU)(icOA}=QF7qsA0S$Ztk zD`@C)9>G9J%@EP_DJh@M99Y8O>`JN3s}qG!&7g58l{O(u`jiwA z*chBjpK2pF8E(_X2xw&k)oAQ=;wHt_nyRmcl@o1DXZ_di*5C*(AY>82>-*Wb|KYmM ze!zpS>m2n9Cq49a9TgY;`$2q*QClm@_3XABbOuJj$d}7OgK?o%0#ozifR`W!$Sz-k zh}%VUk=klu0d|%U@*}$z0jSp)sMO~p_8Svnd2JzBbgt;Uqx1I8TRU&*yvgNRz&lFr z`!cp%q_SkoZf^~4p}ezkAkGNlBukw7k2*hP3XPN_ELN{E($bEdAPVbRc+X*R;E8c4 z?f_4lZ{7jQ;acYl%oYPTWS6-EO8?6HTkbld1bLKThLj+-8Dk^7S(VhKAnc+kh|yJR zt%0N-ClzoKV;OL|819wzIg+t|)vk5xSDq}LIFhjnP6 z%6b@AaSeUPwdo5r%h3sUjBVnZUd`4i-S!*l;oDhl9(Z`?`*@~)i}!o1J$f}9MCCD< zz!aa&v6zA}#CyK(XCj~guuXhAB{ z7O(2?AzryByK}8SzF|h7MX;Qw;cW#9W(4q7hsHM?S}j6=7kg-Y+VLP~uJ7aB5AK#f z3tS$ow{-N}aR>CIVhgF5C-0D}^-OHu|0>GmzzmL7SB|Qy1`R8}tI-vtVp`&38T40B z3D4EZ0OmIcE$?Y`E-id+A zSfCO%!rizvnJ-Q5$2lKw0>GSE;)SKg+br|b+$zS`%3lK)@?=K4TOenZ(~U{%jt=PL zYn|qBt%U^meU3*l_PEAuiCy?78jJQ#Dk}&>BbR)eL-QV2Ye84tW$-85{Wx7%kHTvI zA(q?@yLBYTGwEunMhve`-k$6vY_!NuP9_|2yLtv-(Hajgs!xdY#n;@=9klQeH_`+9|k7kuMW5c(^^{OQ!~#l*Y_xhu9pp zTnLa3?;Erc0SeyN=^FEK9lnDuTm1;g#4i_&W7KiI0Ag06pPx}SZfMop> z!q1LF+vNukwnlfGKXkQ@+k*#tqFjv~(c*Q|BR6)wzwUSl*j|W-U}amM3i4QL!uK)j zbx1lO>jn^OKCsxh&jLg=?9SUateuylXh)&rUrWF1TP==GDXql4>1Per| zd6t<`tVeKKRK5Fkr-&||ws>x=>u89>36WQ6_&m4q6=#b8i+4|HPg7%$T7eSncNmg$ z;xM3WoV_fGX>(=S5iI*|$g(Hf%d(%*#-gmW*D9l=b{b$Q>`oABmfnTt$myODBfrPn zXx!~I34YhPTCh~t27h-)UV?E@wyewKo!S5n*=SZqU;m;LteNXKEqlM`99U6 zghFcTt4`PlZ)uZW!%df}=mhfk+-#hsdI9iEN^of_#?ulA4MH%{qNtCnMRP||_r`f; zu=~fTOPA-EBtgDT$u) zWiEV7B+H@I9feR=-%lFT14X#+28%IRk4T7*UBvxpxWt0YH%Mi3K_)dWws`{VPYl<1 zF2t~|U9nBs0P59TBMsGwv4YUlZx-!gDkyYr+=82sPcgy$CD`&cLayt#F(jss!Sipi zH(tvT5nO<8T8z?H8?FKigbLex7A^8#WXj7&*Gzfu!-YlOfckw7d9r?Q)9d$aA-4L` z%VF!&($tOmd{sC?Kz&B*Ag62WxCDHN21;oMF2Hc03@Kw@okzV8yu>F%P2UL!4Wf1} zVFl}1g&^!bu`6IafcZBrj*WcV)xufY1)}=Jl9y;q@H&9$?^uof*46TcHUN^a(IE#7 
z&N~$3Q2(*dIk4=7yNlE~@uGi!`vQFu|5Q7v8);HyHAjQAwiy<*(7j&LC6nmb=MdqJ0ueWUJoQm^?bqp67|D+Rd>nv)M*XQE8^9PPM!Ux5M+6;LRi%H0MdDjfBc5^GmK6Ss*$I~~8F*{mhl1S^gDD6#Z=NUEvj4467 z#-&w^nHEH0FRywM^ribaCs!h} zSTn*Wb&F^hlN<@G!?Gb-sD=V94*& zRmb8w^2Nl@64LXiw{E3zk9m$K-B7UgJpxKI_m7g*lf{}r?Rie9WnRl5)T-ZOwEvQu zXPKB-K**FYr%h|o>Rni$rC)X*Us;SU?o^_*mm!jOUl+pwo@?gaOO*C3Vf-kNwg=wp zZrR$|k1h0Cq`V7ZJ5$9$-Bo;q)$39Jw}mS9dxs}eMcswydY*o3nAKvc*so1o({U{- zzB*b~#4R)&Dr6w88I2lwO}Eloz%Q$1F9=>BSYAllmq=Ru@st5C(OXq`BeK%D<3K606MgQ<|W}6HIhxu6Uuk9bOGr?_-_C7pH_}MkE z)yRXwc|nBZ(yN&T`F}oFoIzDfJWs0VWT4=)?Az*Gu9gq7T0S?vourjA3#DF1g>&SK z4p82nDi0OiT7}og%o>3m z4lv+oSIcl$N2Z9gmf(-kGHw{q<-~2zxE8UY-`);LAGkRIb1hlML#%A51M;W_fM^=c=m29_WVZD99OWtLLO!ApIG zDN7r#QI=9Of|q)-d#OJKdzix<(bdPo_M!ck+8t|+0a?dapd4JvHjeDe_0eYuxauJtw_i}=b~TPRSwuNeG+9?4uW_yHGf>yoI8Yrdh& zIFy8$`)jpq{xH%&4k&Pb2S$^D%X62gLpbCh{TAb1-0q>nWW8JbQpCG;!$?2gr2i3Z zPWngd^w+Zk1pQire&^k+FIHkN4r%seai@j%F_0lV+jBt{+An$?j7dknj1UOzF|6Yq zwVrS`;kGdgFRx0}meQrM1A7uV+&dB%D3;uQ4ytZlHV$yVzFBy~RkBusjtahDegYuD zxGiD3kIC%UyV@17u$S3lUp2P;J+!`oBu;8=3LaWTY6G<>$t6cPjY)M z(5{A_x-&HmM|EwW?d5o(2^C_u*W+L+w#s3yLHKdQOD$v(+svn1BojXa*8KpJLH+$3 zM!*MOI05+Zu%92QkA{TYpzOe+4=1hKb6^Dai<3vN7Cvpy$Mt$h<8K^?`?3XYd9Iwar(y zKR~yDqa@2cdwDWm|9~oN{=V*F`y(0)p|kzdZZ(9#M%Fi!Tn1SCW&{=4Rxf}vQj%X$ z5#m{9?w3wW7(~Bj>5ba5*YgcOp zVK%iE0oFdpRI3a=L$#b#%LNIv_ko~Yh(qHaj=Bl*Guc<5swEp2GFof(H>|8cAr}53 z3BH*C9PN0FkGKWpAKw82XIHeTD1cEwr&E4ecZx}=3s#aeA3*WW4Wb6w6`=mu86~mG z;ch)3VdqN}CRWZm_5ulSCH!vPu{6O?rU@Q5DdvNs7rXJhk1yE!81Ho{Ynj{Eg@qJ! 
zIekTR;%)u3kuNuPInd7g#x`9I+ch7&_H=j4dnqjk+EN<-bhtgUF=Z5i$%%$*lwmTJ zHl~EwLx9~*D~KsFp&C1tIIX{~$)DfC^bcU~B}ZZRT!3mX=-d4EwHbIJQu@g6JNu>J zfke#wN2iQ=@0yXg6^WnVIosLruVM9Hs+t68}wQ-RmPJY?35#}Z+&egaLLkT{$Qw>lPUROT|S#;@SqxDsC7{Bms5TH*3&2f})$!7Be{7%W>iSbu$f$6|&McDwHk zX*WJf9^P)zCYOtJyB+JFkN6w*8=-Uj(%@*Z9|+qFIDVL6K=XNs;Q*yK;X&FRpwP~D zKhgop+4Tso$rDC6*m<0P2rcWE{_}naJKTA?w8PY%mRc1B4FBOfBk4bi$q4>)5FLu} zpAPuXcF^Bt`p<|E{xd@7^t-{yEV=;OxC6W=9j+$5$5eh4KtlPkRGx3-r1BNG`IO2} z)RjL<#!FEz;q<<5sMLEf+O4R6)<%Hde2FLZjy8Fq{)OJw`Ztw0vi>^9eFjIf=z?t^ zY%{@lc#F*h!vRWfIvCr$5V2akN>0>2;_LrQ{U=I0{D8I?+zxz2ov444(f_gjbxvc;A%tM)##qC;(!}9BIC5|eddro`kBkVG6${b1u$ZjeL`&WLxS|y zGRtsP#xcp`H#$+wV!O9El zvAyL$5b|;j3AzY<-i}?$&whm0=ac|_u&n6R5~WV>dQnfH+5iU7BoeR2K&4Rdx+`3s zM=3O)dUtiCqrv0Goui+6Om+XN%>n_}Iuz#bI@cMKu@h-$>>}}YsFuxmVJn-(=R;7h z6yBl+KQ1flN!X{Euis|xy^cfN_7~9J_4_^_a`BuA?Lj_n0h23~ms1kP(v%_d9TjNI715X-%2H+}a|T-F(OX#EV6-)EjZ^CSH>{6mBDQG{kFvhmx|IDg^4x}P5f$9`f6 zdYeAlTKbtm{|uBg15Ao|?h}KQ*ydZ&K*<*`fCoi~_ZD26Q8-1;2lAn(YSAEw$J;4~ zw-VRu#!!en1QqE;BOrF5L9b!$dQiIGo@UIzbv7MYz#GC@QKt8748R`s0ht*W6;=Ngx1dYRni^d%ix+$b&7sZ|OoInsmWiw?EU{rCi-waUgDrOEgMu)R4#NBZ ztlNGmn0H<;`dwJ5uW6+S-AYfM7-%JpR=OBB*TT5&kXAw!d< zMtkNqaC$sH;kxR5NU7UEd|g$4YLWdzJj!@^F#0=7xw0_jQdo`$y_{Y8k5DUi>30BT zm;M>TDk=THK+JzXXqA8Od!gkYjiB!3A&C5|!<4(9^7CMpo9Lu#aR}EQrMuRzg958} z23_ku#9rZA^TYY=neYsO#R*;bZKCP7J#^xC+K4YF@pOYYUJen$Z)2MnQngo4kPMn8 z;L2|O%fQ`k{Rx2Ct$!|Ix6%3=_3?zAQvV>7lpvJ8gE;N2q%;SVu>NN8Bj>xJ>rZq^ zJcRJ0WuR-&>-fNWm_|8|GUPlyWP6nRrnTiz6B=QYh;9CPco10! 
z29dQVVgIG9H;FqdG0*3Zv!mY%T@Syq9(aIaqfNzn>xwOn4^(V96&u5e1Xcj=e?vLg z^>nZeunQHNYgcRvG`3g5c);wHkVM#XRBSBZ-&-s7j@zRIwhMrovrKi z^|66E#a|1ZZo;khU(@NhP&)k=8FroS1kA3}Lc+FFr#PEVFTN34r;n%;5723bsZ(EF zr=x5-O`}dPVansL>6D6U4q8tiBEzoJcEId9{g1FWsna;zi&^VQd;PEJRATDXPuJ;R z$B>d`%uk)XxCIkBop7Y~R0Wd+)u~4ioqhtcU8he7+eMwq4g~7d|1auPMxDf9gqwqT ztMTvtfjVuZPRAHJT@^A&>%ngYOhTQy1;0^{)m~47gXq+ou)Wl2+0TJGCBKI25aUbJ z_q27QP9|2Hm36GH*iA|-4Kh9Vus&q6R(3I&cf}S zFhrk_HC&gcy#yPwb~Effz$945Zh&OK>;^cKupeoF=YAC8%`?S|)y4DvBT&2<6mJ=3 zxP*9v!_<@Lqz33BcI%{`LWW($m4MkrY$1$mGTx(wi2jau!^@B?gadv?lk?ze^)$dk z%y#t;U9F>RYAvT)FX2{KsC7@`5&GU#R2nVcO|1`+VOMKAV0N|sM_449x_8`uO6gZi z+YwqVej7x7Af(l^*s9i1x>{r7NX+j{JE<1u(}Y^?FlxoZ(Xy$fRV4)@9LXpoEFM=FtO1G zghf^CI~7>19>Tjp#OuUzt$@M@jCs3eFNo>zaI4O9jr@sZ#epA81#D+tJT9BdNZUB z(COc1qn}6mDF*$TaP(h=>cT$PNctPC_8F+tKi5XTjr2b|K`%LeXc+&z!k}L)>F==8 zAEncOzfYijc98z{2K~#z(eGo>|6bDHX|>NFo&FpfedPwwKgpoKDjfaCpt`V6FJiTN zxHH+b{|P$%V{PNDC+hU4*yyhzeTPB+(s1-W2L10Oea%YWsnb8$D^Pxo^zX%8u#oJYaP-eG z=tmQ))w5rxFXF>VI{n*h^b;FFKgFP59ghC1P+drIwWNQ*O8;b?{<${#Gf4k4ISmlo zoP0(Y`(I(uFOu{RS?zO*PXGPrK>I8w{p$_-mxQC=$DrRS=_}}GP5s<39-#p;pk5^=wBu2$64h+ zRj2=4&p`X+k$#au|AI5a*uN8^3HfJ8`u(l+Pt)m7vC(fJeTPARdN}$XgZ{UYe!P`_ zqE7!{k3jj?kp8{$E-5jH(NAm!{m*a{D@uCc3-(cux%PF1LBCMaA7Q0GQm6mE8fc#xq<_6Z|J`0; z==U+`p9?9t8#P|+ay~;+=0@jucq+axVq?)p7K05IPlZ!qHMAFsHpNg7%W`Hr{l>$G zf*y!u3VZm`_PD}h@W)ryp|JK#RG@-_`zqWC3%bvTQ}A?y`)PQv#IN9)y6INfD6gen z83yH5J;Ru8Crl#vUk1w1D;bi?W!kS<-$Jj~A_Dc&+2TW-#MVv-#(f8Ff(6}c!dW%n z;C_2=t6sx&y|&PxmLai5y($gL#o_ci1fc}~#h?tmT&7+zrd|gefqLm|p%<~W^kCfg z>oWw+<>BHlU#m*Y!HuM%hKZ+HprE%G?l6FPB02U!>(( z&vhy3hnLa4gkEV%V14Oqp_f?2x|M2MtSY!C8{8L%)9V!oE@b=?t=r$eM(BEtwNai& zy}p(=>#@zRh109rp!}YU8)c?m3+Y}$uL@rmQvSH1vxQ#7)?T987QF=bGK2dYeZ$n( zPY_(_HB_|jr-fBp@f3)bi1IF`UH}UPkA(2?Iqe468lNb4_;Umtj}Gx^>k{O;e|?va z8#sx9cYySbV#Xd}A3h~=bPcGCgFf8&_cPb5TPLQOLy>b0|EIzrFr$4nEQw`g?GAm4 zp!wgl53DkSJK!wkc^nnMCli$Izz0v3gQ%!^FBV($@VQ*0Ap?XykA z?nCW*&lCdtr@p>_mb#e}E0%em*yf^NQGf@#0A)c0m_%l_wc=#LK1DS2zVZ>+v~2M{ 
zBD~(uzA_B^VZVbt57w!=e}*ZLbqED`Ag84aJeE1Ws4xY`Qc)hP1>SiE*C8zPZ;Hl) zwZMI03bar(9*pSSn#D)X@K7c8=N-qm@W7v4*q?{3w|gNtKC;~QXCU>oFx2A@f;taY zagGjCAn!L6;K5qp^QS3Za960Jg?O+Qcq&W*CoRN-wLn9d0_RW%9;^j&!xSi?4m?;3 z3=UJEkvj0e0u5+!Y}?=e57&Bl(YNd{!iW~motkpHX6wW}vxSe*TX^d4GHh)GdV87P z!hbMs#&0o3tR#jOz7uDog>OfT(CYB$q-f!M#>L*idcqm(E&OK`u($9}0c)=}TlmJq zUD$C5quc2{&$1O1)H%=C;&?yH+8XSQge8KW_xKN_T{a@7Sc5{-0WY5XdLFuazeUQ< zC?^P`^Mf!tm9Qu>TC*FBFmF83PAW18seRyUmw69ieMzcRCxy@2NG}c$d@Kd;`VE4Q zL@FpsJW=)#B_@b&HDF{n;$=Yunn2jG6zJVuA#^(}2&tokkn%y6_Q538qLT`)TcHNQ zPp06LkqWBYIHLFt0c8|Yc4Oa$vUc5WAak38R1)?RBl@BDC8Gx6;CNRm&*JS#cY=vN zQsC=o1Z<%Wa~f}biEVo98wgZMf$l}hE|3~TpdW#37wBWcey2dQC=dewRCuhN+VjA& zt9O48YBv(5k=pU3hB&bbYB5f1`>#}&*Ls{-gQSQPISlSRkiEtZ7H^h-3lf(uXd;?R zWO$%8&Vh$NM11sALt=&(@KxFi;!p@PQeXZe!^8FgrT%`9Jm;;c0N8R$ARUj3-8arNCR{W2w!}KdVJ_0 zP(DBk6EO3HEc)irJYAh*bahy~J+1{$N^j9N;bs$0=r@b+c$!TXu@X3#B;YXeKj12c zk?CN-VJ3I}*8^x*?LFk%Rog_EO|>TgYk$#HZ7C|;8&B0(*T7|6fAF|emDfgHBPr_o z9*+8WFnS#Wjm5FS{)&Td7=*W>zxVRsr(oGV z`Y>~OFxXPgk-D6;sV*l?C*&ObGYarvvJDjWNW2iXAWyLFM{4q|t^^t{F&? 
zNN^R$6Hg`HXyTcU<~MKCiA4GQXP_K|l-;-HqpV%UQo!sgUP##aRB_K+QlM^~T2c^F z1A>t1LD+bbYST#t*KOwK5PT>FzXYkEx}_4O2T{f#W!G&bGVHpw0A|;%p0E)V$hTAI z=KWkmw~2gMK<*PiUvKKC|0pBZUIvxb`oQSD2>oVz5j_-jz+6VI{Wu!~BYY`L(Wd{T z4~!nim0jE)kZBk9L&8AS8o{;$)_x7dl@aV=nA1D*M+nY40-&<_{IpuP7_N=xgQRFa zmtr95&BR78e^<(j1%@tV<-@?UDJ)R19xH^`!(O78-uwt zMjXzDF`k7iS`L_qQ6g{h?Z)^FnRa8mP8g_Ijqw6t?aih!?xZgy?3c#a3sf58tB0ko zyw;7uni0k*XUyY47=utgzhUI|W$(v>#SWL4b{MDI;TXKniZ}UA$kv{ycV`=M=Pm5e zO#R^$%%vSh;2dCw445&b9X>#&-45Fb0~M zbvv+9gdN1|wt4XTFWx_I*ddCy7d+4oUkNLDqEOT{#W}hudgDDQG(|Ep+skxQJd0ap zVTz;(X$t1j6nTP#oaqrRF&35#X^J!a&7piiZGezs)qoB6x`RA3s2Lv=1k6 zX^OWWl$!EdHwEiOnBoe$1P?TYLMU9qG}LeCP`y0R6rUj1KWq3;-3aRU(g-t<+rCOS z!WvAP2qSO}kd&EABUIxo{Q(;m&WHU3*4fPUtQSb)X{tPE#V#aXJGYZ5%o<{zkol+t zE;<<87kwwWHz3R8KFZ*pE9hd%iDz(c69i-xaO;q^7Q%}vNWzu9RZfI>_9{4!Fi^2p z!6|^XuQsb73eM?W`kkx-ip(m=eLz~6*LoGu6ru_c(tKVXSZB$4fx&tKUh4(&i~^ev zqW}+fiSK|qc8Qw_LrJT|R|3|4w<&QxtnK}%Q%dM07x=&kgWE40fbakx=scG)HXw^i zU3I@K!fRbdT39G`FO}kfr9_uEgIOSK>U|dp^FS0amP*!9Oikw#g$Mc&*^q$AV>)wr zAOW%MDR|%7dk?DmbvTk&L5id{>ZEq^%qKU$`4A|N66Is0jzDi6k*^{08X^aycRNuo zCQ9p(>HV9?!-za16unHM^d`zbk4$d>kw5+x$Z?_Q{fb!uZ5>g*=lno;`@KTsRw8c< zHJR`*QT|Pol}Dy`6_Ni*Lr?{p%6?*+0m6uq8Ad4(v4cSBze9#Hjk z;%8U+t5A1h!z-FE#J#$gha(v15zy2sNc|pC59SaHiQ*>80-FG5hY{dm3c!QCr9TNu z_LjZ^F#Bk45n)I3KKLIUQjNJ*<=5Wh7uT-*63FT0!Or`vAiPfu!n+S)1Ihc=?S3|F z!G?jGs-c>87VfmYGo_WiKeNNIRL`+tB%DAAd9W7v>TW+TyGrv}fCsxuGa!Q9oLPX` z%{iK|Q>fDBZQZF12tuVt5Gp@?11e{b$^uZKZzTrxnS4xLXvYBA>*6E^@Pfy;(~0-} z*T8cUFPQuFB+4s9Is6__ECzmdrN4;XUXRfIc(B`R1cb8NYY?@v%`!z3b~ZJ=d8^bQ z!2QOA;oVB!JlJ_Z11;>lR{>_{y@arTlJ~LX9n2~}z6+~7^A)V}4N@?0H}|{!E`L$G zbtY0m9<1h^8K%HhEWm?ZrOzOOU8UCnv#az1VHs2@bxU_DD}qp26og7GVL7C7c(c$t zg!{eo4y<$Am$1&uNQHL4$B0)+ynBfk%+QO8GLb0P62;~J>Cw8T0q(c%&d|J&EU> z1_hxK9fZn(&q3u%Qn?mXLPx(zm@mFTI?l(3J?;FXR2RrZQpoN|H zeSq0{FD2{-@{T9(U{?A0O<3i*y|9WGsbJCXj@$i3?bgYrggjW?uPjV~1uVdWU8OG| zf?cII0kf;Lo-ppf^QO0Vr?N5#m6jk>>Iu7rRD3T9twTh=_uhbYmVXNCyoyxl==TKi zs)=_$@q!upKSaqUN&`_W?l<~2e-XQZA4iwu<-u;Rr$NbXuX_Nq+si}P3Tk?cu4#b# 
zy?ASA-fiT~gPr$zL3p1Mgm)}q50LjAFZ$U8M8CS%VU_bffmQyER514&5JtirO2~uN zoZe-AUUqX{!vZ|mRVsl9c5~(dW;f?J!k(Z?+t+ueGB^m8-a)7w{1{Z8C6xwH2_5~W zyawwW4`go^QlZ^1iFiML1iS>|1#`bXM0tZK4x(7x?}c0ZMeO!^0v)xN2fMw_hER5U zIjNN``t>F3C2D%xI;lZ``~5Qv@0-Y*2RrYz(8A990l@6M+X&<8ymtV32eZmAufQs6 z_rNNhNCk7h6*v2f+O3mI33;%Z^WR|#aNHvXKX#SAf(Uk%-U7_7(o2NBNtH5R=x&&M zf>7}Up)!}SU8LdwmC)|D>t$Hy&JSUo*O3bCeoqpwhIkJWFPNd15G9u=Hz4JIUy0>= zX5xMC_^i*OINX%;afgmY@ksI4v1pK)a_T7{Ci;^j%>2=2YMhx$HdAS4iu+zA?JP5u zXQn2ZscB~F5;HZ!OqG}^?q8JjD$G>1nX1!MkeUms60yNdHJPc!e#AC^a=Aab%AZ{A zPpAQ9cJn^fA&r!u~ihW;9ImC$qSI|ShUx~{L+u9`IGzo$wMZpf@yC4 zIu=DCMKEX@A51}wn%R6O1AoZo*E;#@SmZQQ31*64*WeF`BaqZdj`l;7{YjTUndeVV z^CxHclV$#7wLdw}pKS9dm-~~e{K?h+!3p`aRl6_^&G%6!C#S5%&N$4IKe$?) zp4cHy&uUbtC4EFz>r2rn;c88v=W2avIiVftqj6HUrza~N=|YQ+^hubURbJpbw>#3O z(E=Um!Uspb-H|>+SfnMr3@;Y#NUxS<=aG(+Y5k>&9qHIIE2xX70y@&;uEyo|^b-7+ z;(sO!D0}bV3qT)~ax||327lX|DLgWsSc{XZqj@7Q@!Q@^b&&DwZk&*@mzVf$Z>ECC zcgGN`ViCGGC=ZEs$Jq|(vc#uI)UoH%rd*I)B;dvl9CEtaSM$dgB&;-%V; zZG`2u;qP- z#MO~Hf`~Y~^mDR)&ePA+OBM~=V-VRCdF&pN1`X=Dx8F*`T_R#%Vj^FC=m4ygbizBNA-*XhA)mOTP zc1~=~>Io{l4cK^>lU+xo_!xQX)uX3- zN1=viwuQv6B|Z@KGFl{Eq~+7;-iwi8>+j|Fy|P+|%*l5ulU=RDuEn{)-Et5wUC$c& z6~CL~+H^Ql`{oVM!2A52Blj5ZXTuhJy#F-~6nySrX;6eE<>tirVPDelzt!kpYmg&Db%bJA=u_^dlCWqejT;i>;Nfra9VnX zJI75fR*Evy3Q5V&EOh6jrBAa1yVVgf?JDcckQJcI)ZF}ug=y~W^qc~l03s(dKX-Co zH(DagJqZT4bIHug9iIX>&d4Z852%OqDd{74T7(%Lhf?#kjMC9k5W zZe~@@Z0F3X`pOa(5bTnisYR8QRdvpyk`iZCwIHS>IkQV=SJljS7FARj9E8BVm0@LP zI+bC=%1SG$m9+BO>WZTI&bqSlT4#ABsS9a(E0-75mKPV+*430(&N~16qU!3>$`ZvX zekiQeTyDdtDFvb8QUymLq`SmfTUJ$HQR2M3)Hxyq*jZPFh+bPXtF*UYJEgcHzqF#X zsJ1k_} z%v6lLB72^0+3fP#S{T3?kbkmrG74AB#DBdqM=3~~=$unpQ>#}Z46f*x7OhR0m61s! 
zlw4RdHOo1qBxy*Al9HV^EF~{*80ZX}GcRd=(p5>dut8>Nol)WLv|7In0y0ozW=@9i zD5_jhRaD}xDL*5=Wr$c2yL}3rv9Q@V@fcj0LNIW!0J>gDs!o4N7?o)0*dXkPG;7{Y6 zrf=v(4Cy2p)J`x;`K7g`b#?TZSv6Jl)y~@Tt4f6v7FVRA@z=qnXU{IGEV;10w7%5T zyr6t`bp^eBPE|!c+i&4SQF*mV#TE97vs9~L7Tu7d76bHh_U_fIm*~};S-F`sy}+>> z{&H5obV)tSj896f5G|*wvY@oOsHW)h05PR0jM}oWR%ZE~*H5cN1{-P5T0b1f-C)HP z85Q-lWqza*(dL!uhye=_0~Vl}U20xks$8jDSzlgS$Cg(dHzH zF{|tAoCujU<@1!X((+klb&5-zf?~{8W*607L7xaDL#RQe9AXt0RTL132NIfFRddBK z;ZRCmQEe@{jo!Ljuujt}ucSBSqi-2sKXWG9tkI&<=am-2pWuy#*Pt;e&hom_8WH7d zr5L)%LO7>jEZuKX`Rvlf+S1~o3cFSRaH}vSD=RlublBzy5@TilfQOM$hC9oh!?8<7 zZue0zhOQWBX6B@)VW2v`Q18+G0|<_&3JWIj8%%}i`T4o|9P<{6;Y0yQT80#opE2eN z7$2vl7o_I9^Csoy3$X%+1%(B!l>Br#(#cQ9cp^OqgL{qxh^G$#@=`E5<(NdM2w^#l zwbJn;q~exflFSe6H4C!~i>oT(KlR0R&dU0V3Mad1(Hlvzl;LGnvrC7U&Y4wOIe$2u z55C84;H;z(!)uEe6l#Z8p|2>e7>>V@N%Mwx3l+nYk^=t9fm~lwIvivs*OgZcPtD>f zsjRx1EY0AXK8HP_-YLv2hfB+$ODcvl=VQkt2Q)zj7H2whCTC?yzbCyRwdXqv3#+SY zor!bH%8SbwXNO!>Kh*gzF{BvwFFHFKabT`a|9oetUd}d-3W0A4;r0STex1IIE&g63 z$&L&?r1IyV@9*eEK+uN@)QUAX6A_~hFBkZ$_$5{K2$(GJ*XS2j!m@Rxvr20OlfPUJ zA)=xRt-PeDuBdK)H8=_%Lw6%$b4}Ikl*$rm2BVYLLqu;UnBb?^w=o=&!B!tk)+RZX zs>+J_&XW;)Pj(`NoQwc06NthmJ8SE!tE+0zarX|2XUR7B3>(H+>_kWC95zghR^+%< z4nAhr*VTjYd}rxA_;PLe9AQGMoWq96@yp*PU>!!leG>ico%grKsGRKt9lY*1fq1yB zNkcqxr}>c~S!n$1lxbp0VDhB&!pXSnEBG7b{I^}^4O?LGBk;A|(OEc$mN!pC#C!z2 zDY&zSrc>Q%lU#+V2$}ycM)%u+|P!_<7tSp+Q@nxBwwR9YeR z6RvOJ7LK2_oGaa(>9Tms5zC$Rf8?i^`@ix_OV69+`iuI}9{Mbrr2<0Rk@y%QVFqlH zK4D{J6s8<}YJ5sTdLbPW-i48(rC^wH7#gOfOiC$aAeZI3d1bhID8Ps{3-haah}Jn7 zZfmdz8{cKa{`=+h*(ov8o095E7jm1LhSt+w4kK+jFJ+cxNBP&w2O8-wmE&Yu0Jm`b z=m)SU2J{#sTc+;<354bwP~O)1!|~-Eio9tF?Ad9Zzg8}YnqllJZ&gxr^YhVd^0q;r zh5xI1`fr;iVRANKtP*5RCM7F{(00PG#GR6Zxz=Z)8Z*6r6`ynUe%#P><@nE!DcmWj-SXX7yum0c zpA+r20-@L4`z9NgzmNe??13|oJ}%G(Bmf4jVGF!;C2 z+XjPwyS!~M__xd327`aQylpTTw!Bz5!;MFV8$){ZHdtY__+sdK$C^wI?v7G}J8Q3jCn+8`gd>Nh!FZ zyt-N}fYoxs8!JO%D$H2*;e->HH|Ex0PWdFz&CHuDXUe!#l!W=k3M_Zda$a6kd_?ZgLqXaH3rgClWZ46(Z;R$}64b^!!wsNMab_ z$ATY%IVMGsq&csT~x|m#I87f1U 
zTs8}~X2u1ak*1w(^lFVoHc3q23X0+Bjrtq(MhSYw6$Pbrk~x;ejc^{|$9h}|j@R8E zg5z2Jj0D5-O~ZOqC702y@jN(P^0%UgqFaqU5v3(!%}&=xUoFc?%P*>&RT^M_RNd5~ z^16(w8nHTpIdgp5)h0PAqrJdO6QsnOgF65%Eb2b*8lAHxt zMyM&R#Toz>%EZ18Nu4cfidfclBD=n#4j8kF%CWA7Yb80 zTxQU>D3s5}J`=H`Lk2Rq*Xk9S#T^p?cwAb*mL060LIa~OPzpq6F<)$);36`%`haD5 zB|AClT3j`|8vcPLSh2qXU7iR)y7G!5HxW3E5g4`{5T(>vk44?-BY2TBY@~Cp=xK9A zUn_TE7zLpE95`SRmX$80k~%%9%wMLa>u#e(Q}{Q1_!o(V$@JQj7C{r$?I!Dm+9MPW zGvl4pvX{ZBw=r3F&S6d#J!0Gz>-hRmG}k|X_4E7dyvWa7_Y7?C!2&qZL*fSw(0>nq zl#GLVXwf@3BLXK@%S|z|Ra@l1Zm!(R|BdZ2CZ@Q9F}hh(zwUTJLrWNZYm+v_@Y|z1 z{eZ3@4Ben1F${jtP-Nl<+eed#)@N?6G2&wItzvx3Ct_{M6(i5ZNI!LU2?j58$aWEO>(kT3}uy$x2B&}zG?&MUlMsg zACXd&Bc#(F?(&_D7tp7z?(#L^T!-`HfY&_&I35(4 z^aOV5GJ)_{O7xTLXNsie7-+%PAJ496)N;B_4yvaxuhI$ocsS9a6D_4c7@Nk0Oy3+KHqCN zKTzxQCG-UU2EcKC0Oz}KPQz1zU*fzK&q{wuz3_1QILNmZuPdy8JO^+tfql=$L;ZK* z`~c2h;>^c?_;Xy9uQ=w#IR^Ia8QtdS8Atda{BL}y%Xc~v^j~m%@IyZF+wSi2ZR9g@ zF>x6&@e^YE%#ChPE*LQO^s^F9CJB~L!~e8tKA(83-eKWA?Y?oPKj|wr1p-9M~Re}XqTi% ze10-$^;Dm)C(ai90^r{W{0xDg?#E99Ui9N#e+ggc4}1;yM^hgCN73t#BwTbVI?qav4w6Baau z{-8Ib*yme=Ym0xuZn}SEq0wFo|4I$;ue6v%xCd-09gc;iKP&;gji7f5Nt#kI(K#UGRwTKfMmXK-VkX^auLXdGM(qb&w+b>j3EG&Gh+dY5y4k z^q^BWJGo=pA|ek3CzxTiKl`zH6+YjzzMy-309}9E4)9w?WKxfyf|`EIxc4IZm4R%j zcm03$IOgit!*yfi(x_H-Nsoz*JtLp$(HnhnYM)5=(MfR^%h9y7Zw1RA#yE-g_+pK) zhu-(O7)Ntr8XS?C5iyG$5veh4jo)y3!)zBKj^sqdtaU_8h}j78J3xM?Bb(u6ad7ue>k~OM3I)=DKVHWt8H{_t ze-g$kWwAcrk4P)oW?b~g8o#ecyt)Okc{_Tr*U*K%j+pQU6nQfyTJ&-Ydn{2~qn1YA z7$M@#wQLMOm)O7wrG^vM~p`v$9O#tZ?O|?N3_E-jPT@7w0~SAkMa3N zAr3^Xl=Q>&$yqUNQIYk*`?-lm{ANAzYaEGqRyhX)+^D@rq?;MDGY-98kZuOi`%KWg z1oWcs?DBOX9d-B_tDj`W>{laJnSPSeXAp26fG~^UC+8w9<9xNLcYlDkB7ZIDoyvTS zW-mo<8j>FY-iC6eg!}!=-ws`j0WOXwvtyELf=MjK@drfsd@>5jH=@+9qLjT+cn1KG2A*^V8iqq%s48DOhA7I-5$ZQl$~O`BfFQC* zU6rUPiz3vm{glHIV=&B%Qh)8Iyc2aVvN!Zpf9t0#jXqCNZttaj(NFoJ*GkkbO3j*f z6Pisu3@QJ#)e7xz)#r4Wr#V;=3J?2l4k>!a*cv3S0-hk9opG<9`FUu99WiWkpp zj8-4)t32KdTp#Vtg17b|%7cAWlzg@iQMUC_LH3V6PQhxziw^adp33JA6|WMzCqn(X 
zr}9+9Sw2;{A@bk9tI9o5>MyGDc$A87%(tqL;cZm~_-j=K%Ap|T7bkzlnjWiQgJUS2dDAl%@X zi#xp~4)wm{luZtG`*BLALw)!-<*^8LO}z3_q`Koc<+n)nmE)BAqtunijZ#+}r)*Z$ z$KsXid#De^E06V1?~PYFd#HzxrHS_gzo)t(Ub!Cz0z6uMHeT7>OMM6Uz125>-&=hd z@ZRdyc;%fw9qgthouaHr`2GY%xnumet#0Mh z@ef6IUZUKbq5gb{vOQzD0~bGJsvo+P`(04udzX43Pr2Kz?wqdda;q<2rYxSI?#x$S zn4qq`RJnbk`rr&@$3%6*h01Re)vqs69?eo;nyl=}QkNAfOS07!1LHNKR$rW= ztj-bi-_3mjJ<@Gc)U8vLMN>N?!19KR)ekOK{&%tZ$;HZ})7AH;D^FhX9lT)E3{k>c zr2g-6<&De5#ld2AU5WC3iMZ%0RX5L6KAR~nTFTVd%apInZX?MhSEz4QC@)>1E}E_U zbj3J8KC4h4ovl1nqrO$6JY1{ZQm1@W3mxyTQ~y`5?5;y{TfMq^u5z$m-8olzY!22K z9-61#JYU%}4|(^^SD(5{*)<=@tyifJFHnBDN_}&I^61q-d+-|drfZcCuR-3u*Q#r- zQ{KN8$t~BZ4_)t24qT_cfzu-m4q!fTgF|g=bSNL(-~h&|Mu+-jlS6s0kyo3W9O{D$ z9m>x*zmD_jW|p{rkwd+4u|wIth&cBwcBubFRld8JSDRWK>H{8!@)ORl;r#G_SmHi3 znWe1`W!Dno+}-L>pIGWp-f894#-$GR{x*m5BhIhl{Lqao0WUx9RaL#AxAJ{Yby08S z#ppDo5A>D|b=0ptc8EV#?N*eV`lt>4l(l`3Z0f6i-dB07 zZ;>GQYXn;B&1Wf_Bh=Qjlsgc*&QkVAst1yk|3<0bB`G(n>RU<57pl4@NqM6OA>Z^+ zzZs=;^kjN(Pj&k!r7e0O($6E1jZ*eUpNaJ9UTV`QWo_?7q`&K}-hP&Hcb_#;xcD$u z{bHnYa~uob6Q}MOseBwa?d6fmjYo|Gr0b{wNI!72>Kmbaa`ZT)pX^U~!!at-YmXVT zX{2)a7aJnR&;e9MbDf9`VO;<@HlY z=fElId&8CO2`u|*!dSwGF#X{W^_k(y;!_VhaIqv&Jvdz1nD{?l{5DiQn4~;$`ZT0l z&rtt7Ls@?YrFeT7>gxTG)CQG#e^lxf=*!?-8OpB?^uAvcU4CRZyZvlIKjQVJX z(iw9<(ktTBmQ3YP97^8t549;%S$q_dj~u18XDEk{nu!UXqt(|ll#TrnX};~RzMY}$ zI0n7UFUP36GL&t{0=n;5^}bAH!*NJ@k5eDcR9=Wj@{4$Nd8YF0@koApyt*<|d1L^R z?+j3H$y8PjM6zR`dRvC_+d!!MpF!%aE@kB)^`}gwQ>1S=L47k**?a=wADo~*o~hh1 znCUfx)dyY5HzNJ>V0EQSS$5*29WLbsCv&$s)z31NUz}-3?>p&cxccG*^}!6KBjI^y z+I*_IDnohpR2LllH1(ql<(bnC1H5*qx+O!=h8CggJzd?Ap){P)g{Zb_)Cr*S$|&`| z@k-NK=VIh+u3Soy3%ltiu7~mR4mO<9y?cEI9_QvkN8iYr@lH?`TjiOzc~6% z2avuPqwYOlx##?GNN*ntZSNX42&FcUQ(yb1(lu^-1TJ2=Ky64?9!pk{em+^njK??0 z_c7y^)c>!&tB;PWxbCwnk7R;n%LdCrHfGbLK1mKNyDMoWQ%asCud%>58%sDYZS%5T zt>gtstFyZjHvJL=oG;=~<2Wt}i7~{4khD(Y7%(4J0vOXY1oGh$LJ|?E@oB;#gkVT| zfd1~xoz;79-ve7s|L8fMbENy;Z|CdXxpU{vym@!z_LSJwb#v&I<>IR;%H!FURM4lp z<<^VD6Wvr0n>96su!6~upKXF;x&`2NSU%Szo(SXeH(`0CNqoHqm;YWPC!54)XW(-040#v>zC^Ci 
zN%^BDv5h45u3GtYlX#ZOy0%XKyh(hm?rM~^w_g6BNxWE(GOnLZ!~7$&arxYAd0&&b zc@8f3&yi0yiI?bdOM`q05gI5$gWTIB9-G??$($?qHHm+k2Xx;&d3Td|dLGJn9kSgd zzI@te&=z=}dA|<-0b&4~;~6P#l#k#V=*L14aCDWttwD?-`He888pQqdyibMX?gnu! zG4vMjwn2QCE{}%gt_HD{c>6Ckaz}&s30)qmk=q-@PGa-@GvrMT;(5BfR?06lh<_$F zLsx&XLA*kjTWjUc26104E`L%hpPeHP%zO~@Bh+PA=z^T$^G6qm$3nQgH!NRl6#pH5 z0sR^_(H9not#w~Uz4y-~3+o4`fTu5>B5#8Ubt-cE>8bK)qqt_)b%_1RIr2!O*hRhl zK!e=5Kpbnh6xPIC`Naib&U?{>r^)9U#eJuJ2H8D!y8LCM*fAe`+BskDULYQ&%O~c` zhZ@D_8*zDSqujed{H$@Fs%GL}Lw`V*gFXGdI`KN$mDknE7wW{mTIdTr(y!{om&l~N zy-vPZCyuan`Tjj2`CHhkVY$Cfe7r`&$LkMkE^*_lve?SuOrj)PNiLcfE zHLmZZ)EMGa!8^hwxjQ1R4#_ttcV~YybR;BxAHwCyaFPQjIdGB#CpmDE11C9fk^}!= zIFRnLR8`ZRn#TBEf%pe|RRu5Ygh@)l;C=P(`>Xk$UeQokJ6lsM8LF`EJWElICiY&j zOx@#^oeGEU&nSxNlL}@}38o(^JjuUf;-SJ!rmwv5W`Qnuge@Y%m}cXR`W_P?72YHD zJzgcKZ~-iO3Ro&v0hYW9^O+g~V_$~}T|Z`nn8_K@Iy?nTF;u|pOoepJQqw6Yx_{to{X4_|=j15!-&@djZr9yR_c1-l z^f1#SOph_G{WD$vMy5-cCYi2ax|V4_(=n!-nQmvgo9RBL2bmsbdW7jQrnTpD`AnBE zO)_1CQ>^a#^qOl$v~%V)ZTX_Dy*rfZq@GaX~Pndx?> zyP57|dXVX1rbn0_V_JIwm(O$w(EpXnIW%}lp5-OY3#(}PS8Gd;re7}Hvd%V)ZTX_Dy*rfZq@GaX~P zndx?>yP57|dXVX1rbn0_V_Lh6%V)ZTX_Bcp;V}Mw>T|KjfzCWid7PL_PHZmxNsg+!_}|Bq(O7G=?Y$b0Bx3PoEE!wsD#3eGgVC-dYTr###wK>@p@tT$ZL(5Ez-XDoXWWG&Zj|4usz_$=ZaZ38S$I?rAEvXJ3sG- zpC)KKqBqpwZMLy5oWIqG`NG&0ZoE-U`iyrqVuA1*2Wv!=cM~@zR^#AIRPciR${JCXm|MVo#@qtEoWt7U93?w5C8@(OwI} zz4jsPF9?Z+V*F8EPpSun9{h!T_<0p@%E#E96qmwg{Dpkl<;%eRp6g!Vl+SpVg)6+; zsZm5c4?_4yF0>Tzn1;n#RLl=Q$arj}j=zfIpBm=;8NZS7HH_czI~`x`4aRT(#(1RL zjklm)RBr4djh}|N6t*#*W?b#{1^yc2lb2W&O?+Sdhth??NBQhoZ7IYY)bSz4V;|Hw zy@;gnamF9-vD7{7m8I|<#wUMase9TZL*ZN)%9MY))1q6kMEw`y1Hh?$ulBSrwX-pB{kIUSGd_tN{Ek6%M)j8%q^Rdk!hkHwAwA z&>l-a{A--gkf*Y=dYPw|Uwz0emj-m5h_VqcCf(e|@?U->;ru2*B?Pz~^G3=9dp%Ac+X-yy>_(;j0AH>>zl_QM+* z7P|uS`L6(c4kq@*hjkt~Spr-=SG_Km_GVM~5b$|;-W&E>>R#F2ii+d;Z@FOob4Z04`Ln!)vund0XXfU z^^5 z^gQW@@lK1PiTA1h*lrMz|Ly?%o&fxJ0eA{bqjE=fTB<}EzbR}~xW0D&-x7e|37noc zoptNmgTU!|C;53JoLM6bSBk&$6-yyMX1pCZl2nB?0eCI|e+dmY538@<`R3ID@qZqG 
z&%pw0q^dSTybHKrJ(Ivmt{!K7YviXVAbt)w@gXup^X*+BUD!6pN4Q@Y`8+>B4(|xa z|N9D`EhZnaRL1Xv{XyY30r6*Hf!go6;=t#k9mOLVmA--NxiTRBUje6fZ#bmm8$JAw z3dch*f7(=TnC}GSbB5{{9LYh>3Bdm$0RL(L{zw4+0&ubqy>{0hfcweqJut zAbuQzO6@hm?KQ4)7b36v5#Kn-mH_;Dh0hiiueanl&8q?N6Bq~l_;5`C{sZ8|&+%iH z%2|CQiK|+?$nnc38i21U>^Oxd~ASh<`0`zjpjKa2g+XvOSD$rNWb(4}IL~ z4A{8jThE2)TBL_3f2!ltw=5LSV?1)ZrS9q5D+-q@Twgo?e=`98Q2_qy0K5hUj2}N2 z2jC+C_%`4XX9f9sPeA-b0r(dIM;%wlgZ}0Z#+f~4+y<&OYU(~lyDEn@e zPj!VRIsb;oE%hUP>qO!8fcT3R>vo^;w0jG1YRAZPmP$`{359jQBkIciZ$QPz_hOdi z`e8G0dM@K{0b;MhtpWMm&-u7A!OhPD@N>`dZ{P6%d|Lp1R{&lY(fUNNJ{;$Y^#G@K z@y-*=j0?`6e37;h2VKds|1Fe=Q|B2E6C>w~f$AQ$6H;tW?ftOR4m9Sk{xY^Nxq; zh)+*KyAsr$)?D1vVrx-v?QpWaoet;9lsBZtoM<9(rLXV1R64yBdc{^Q@SF+a9qQb! z{tP^_sbgkvc3|rAbUNNf(T&40JKwjaCvFcpQg)OJ==AV6MDjHqHuc)+73eS>`9jG? zpUL$swNnnZCEJC~Uh?hI=z6=`?)g9`PUFO>RCE+U%f=Q|rl-8B!?yDUQex0+E9jsC z8$Q&bQnh|H*B7q7)K0DB2rHMbA-!2D*xJdHu$TV@9Hw@60qwn{tCE3BHTL002T&@b zTU2VO&%LtM38GZAlB0e*yWtAUx^Pu@XP2FbwnSS+e>pceI0Yr5B;fEs$I!ZUHpL?S zS!jW?zf{)!L&a~F@@!~X?A@8pB1D>adzI>LhhwYJk*-w`l49=4{BWPU>`H#xpW1O` zm`JD`s|s>SVLYka<*`$=w?&mxz<$hq|L`9|c8qq;2OKih%Tgv$)zZm5tY?4hJp$S)wIrz0|&zEfSgy<99p}zSU3#c$UNN}~a ztE{TJM-xyPIr7RYyfSZ(!0~8;S{t=HQJ-AxWINzoNYX1VW~l;+)s@Z&<>SZC6D@Hm z#&mt?D3$3Jb1JkVfbcX(z9L-`$cG2IFeMO`S}w1urQ5@+J8Aa^a1d%=e%K~AjVRjb zCA8#<>f(CaM&~G0E!)JZfbJJ>Q4SC{tjkq!7;-elr`(F;ItP8bijRHMYxl-r z;UsOBKtTm&H)d?(wcW-MU^#UzsbzJoTZi)_?0B@rU%1t1S$R^Izqfc2(o5&bD*u41 z^251QKAnas?fX#i7^&+#4voDEy<0t5B4*p=4RB#-gINRR4YsnpyuAUU;W{?VB^pWq zr_~K`1nt0Z3gdaJn^(FYR2~25DYdtII%6!_p%gS0*_V+zHumaUDOY#XI=Qpl-ES)0 zx#C~a^Ayu~27m3S>)xhyvsSP{`rT7|C6iU1qNhNr3f^dQ#d7yh3YL7K*V9Z(NzQr- zZT)njX|;CjUAsCC@q&}Y%=-LTJWet)2xV>4QGUo7ecC1b)YIZiEAm(vz#!uo4YV0X+|G-&0X1K28|jN6fnNc9&cw0O&ca%d2Zu(O~VE zslcISu2l`C=zql8N}qR4V@Sno#yc7)eeqae6ZQ5^N2nd;R~s)>1yZIXo2D8euFNza zT^eu2DWILo%Xv4sD%RDBbFNs4e&(=zPR!DBTbHDG`<3zMQ+!bL%%}4AKt?(iXudo$IuU)IOQC|Yruo9sTtam?WMO(A{c^cf; zP|&@xxA%|P25;5ws{6a$qPwcuU&`eib1c;@VCCVU3YvV`y!<}$;hy0@9ez$ 
zQn{-0z#PrOm$kP~^<}&Yh0Gb(2u#LaDgSnZFvdZLIyr9V#?$N7tyq`sOZm)$h61dT zx0@}ul8Kv2L#TaBp}6hVcS?)p+o;D3qy~{Dp~iw9-1p^+ZVu~FUVAaG8j^gZLmwC9 zmJXX{u+$l8p=4@&uNpnn1O^ct?J)H;m_G*?c074D_CYzRPF&Ab*^7>{O^1kgkW6Zc zro)iP7NH5MBWm0|h_xhKXE9@u4ChMd=^CHKjIM;fnDMNzVEF(hpy2_QvRjSC2cfnL z{WgZdft=kxR7UYvx!NF_pj920js6y6yela^hhs9|+6^i39hn>~KTgqyjeCjB>I9<` zYV9*Z*@OA@Svy-ARhm!N=6DPh-?lsPI<}KT_u^PCr-ls&Locr($aG-2H>2xlz>XZPzcM ztgwEmMk&9cCJ~3x<7@G#4)8Whjac~0Ws0eaB7|X#m4Cx7k4+q0ab=&YQOxzoRD;4` zDV4A8nfW$a1|IN;6UoIiEJZ!0P(Q4ecB80yMrO`^b2GlX_Yy2KnfH%a|E9N(y>vE- zurD2^MPnE#rO_c`Ojj2csMTV#Lxzr1bI|-P3k_M$rPlb=)t*us^!5_ma9BN6eJiDv zN^%TmX%vdxX=5LCVB&yd5OT#f^`8zSfT@;BDPv4e@98aSYBgrw1H+>AW%5jg8-mxxox2;H#0rvWSwwETXuK4is=fFC9cwDV%7rpkBE|a~pVr#D+d(V(yT8 z7AtgWAsgEdNIj~|^xg@bhq=Byi-agRGnCIFU7<_`0--lLQ5b{*?>UF)m@D`0kZIpx zo0JIQ&+I$u2wS+rc@qp^+mb6#TxWO}Zty8eC4Fz$!X11UbNYz7oBC(I!!Sr37tZwM zu%&(~b9%EcTY>oJO}`x2F?`!^>c1mw0U^HziijJSVjH6hyiJtvgjkI$+7e~bo9{~B z!wJ@5e=!BxmPpTT(qr473TyG7wzQe_=KIre?5Ct){3Yl+ork$5`eFQ$uL6_ae5d-r z**aE{8E)!t;u(4cqLJ?db8o(9jhPLZoSy1WyfgLJxeIX>IWlzK>1EG+_KaQ2lAImpACJb@v%aF!&%k#t&fuwz_$hvzjX&yx6wLRChwj%gz5Mo; z|2)#*Q;kY`^Ih|koZcJXd;c=hQc0$K^ZoPWai)I_X-Py)dh?z1QB!|zSR=-!eDhp8 zJms73sb4#wpWnnsG!@Lfp&>+b*1z)I^@L78Vk#Dp{vDq5W?%XEgF5}YHR04kL@HPM zg7(`}T}(0Nd&%1#(g_cF1m2r|KB7^~n9i>9-=uG?Ao@b*s?!ic>fMyBW9a|qzpi60 yHVODg&#fxk^gU<#C%OJ3*q2TL+eroKNW*udKdFEFmA~?7UDEF&h~Nm`^8W|42Fy(W literal 0 HcmV?d00001 diff --git a/test/ocl/SimpleConvolution_Kernels.cl b/test/ocl/SimpleConvolution_Kernels.cl new file mode 100644 index 00000000..08dcde88 --- /dev/null +++ b/test/ocl/SimpleConvolution_Kernels.cl @@ -0,0 +1,175 @@ +/********************************************************************** +Copyright �2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +/** + * The kernel has two implementations of convolution. + * 1. Non-Separable Convolution + * 2. Separable Convolution +*/ + + +/** + * NonSeparableConvolution + * is where each pixel of the output image + * is the weighted sum of the neighbourhood pixels of the input image + * The neighbourhood is defined by the dimensions of the mask and + * weight of each neighbour is defined by the mask itself. 
+ * @param input Padded Input matrix on which convolution is to be performed + * @param mask mask matrix using which convolution was to be performed + * @param output Output matrix after performing convolution + * @param inputDimensions dimensions of the input matrix + * @param maskDimensions dimensions of the mask matrix + * @param nExWidth Size of padded input width + */ + +__kernel void simpleNonSeparableConvolution(__global uint * input, + __global float * mask, + __global int * output, + const uint2 inputDimensions, + const uint2 maskDimensions, + const uint nExWidth) +{ + uint tid = get_global_id(0); + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint x = tid%width; + uint y = tid/width; + + uint maskWidth = maskDimensions.x; + uint maskHeight = maskDimensions.y; + + if(x >= width || y >= height) + return; + + /* + * initializing weighted sum value + */ + float sumFX = 0.0f; + int m = 0, n = 0; + + //performing weighted sum within the mask boundaries + for(uint j = y ; j < (y + maskHeight); ++j, m++) + { + n = 0; + for(uint i = x; i < (x + maskWidth); ++i, n++) + { + uint maskIndex = m * maskWidth + n; + uint index = j * nExWidth + i; + + sumFX += ((float)input[index] * mask[maskIndex]); + } + } + + sumFX += 0.5f; + output[tid] = (int)sumFX; +} + + + + +/** + * SeparableConvolution + * is the product of two one-dimensional convolutions. + * A 2-dimensional convolution operation is separated into two one-dimensional convolutions. + * SeparableConvolution is implemented in two passes. + * The first pass is called Row-wise convolution. + * The second pass is called Column-wise convolution. 
+ */ + + /** + * First Pass - Row-wise convolution + * @param input Input matrix on which convolution is to be performed + * @param rowFilter row filter vector applied in the row-wise convolution + * @param tmpOutput Output matrix of the first pass convolution (written transposed) + * @param inputDimensions dimensions of the input matrix + * @param filterSize length of row filter vector + * @param exInputDimensions dimensions of padded input + */ + __kernel void simpleSeparableConvolutionPass1(__global uint * input, + __global float * rowFilter, + __global float * tmpOutput, + const uint2 inputDimensions, + const uint filterSize, + const uint2 exInputDimensions) +{ + int i = 0, cnt = 0; /* this i is shadowed by the loop counter below; only cnt is used */ + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint tid = get_global_id(0); + uint x = tid%width; + uint y = tid/width; + + if(x >= width || y >= (height+filterSize-1)) + return; + + /* + * initializing weighted sum value + */ + float sum = 0.0f; + + for(uint i = x; i < (x + filterSize); ++i) { + sum = mad((float)input[y * exInputDimensions.x + i], rowFilter[cnt++], sum); + } + + /* Transposed save */ + tmpOutput[x * exInputDimensions.y + y] = sum; +} + +/** + * Second Pass - Column-wise convolution + * @param input Input matrix on which convolution is to be performed + * @param colFilter column filter vector applied in the column-wise convolution + * @param output Output matrix after performing second pass convolution + * @param inputDimensions dimensions of the input matrix + * @param filterSize length of col filter vector + * @param exInputDimensions dimensions of padded input + */ + __kernel void simpleSeparableConvolutionPass2(__global float * input, + __global float * colFilter, + __global int * output, + const uint2 inputDimensions, + const uint filterSize, + const uint2 exInputDimensions) +{ + int i = 0, cnt = 0; + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint tid = get_global_id(0); + uint x = 
tid%height; + uint y = tid/height; + + if(y >= width || x >= height) + return; + + /* + * initializing weighted sum value + */ + float sum = 0.0f; + + for(uint i = x; i < (x + filterSize); ++i) { + sum = mad(input[y * exInputDimensions.y + i], colFilter[cnt++], sum); + } + + /* Transposed save */ + sum += 0.5f; + output[x * width + y] = (int)sum; +} diff --git a/test/run.sh b/test/run.sh index 4ba2110a..dfd15e34 100755 --- a/test/run.sh +++ b/test/run.sh @@ -74,7 +74,7 @@ export ROCP_TOOL_LIB=./test/libintercept_test.so export ROCP_KITER=50 export ROCP_DITER=50 export ROCP_AGENTS=1 -export ROCP_THRS=1 +export ROCP_THRS=3 eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl" ## Standalone sampling usage model test @@ -120,6 +120,12 @@ export ROCP_DITER=4 export ROCP_INPUT=input2.xml eval_test "libtool test, counter sets" ./test/ctrl +## OpenCL test + +export ROCP_OBJ_TRACKING=1 +export ROCP_INPUT=input1.xml +eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution + #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 73aa245a..81626a2a 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -473,19 +473,20 @@ bool dump_context_entry(context_entry_t* entry) { const uint32_t index = entry->index; FILE* file_handle = entry->file_handle; const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); + const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), tid(%lu), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", index, - HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + agent_info->dev_index, entry->data.queue_id, entry->data.queue_index, entry->data.thread_id, entry->kernel_properties.grid_size, entry->kernel_properties.workgroup_size, - entry->kernel_properties.lds_size, + (entry->kernel_properties.lds_size * (128 * 4)), entry->kernel_properties.scratch_size, - entry->kernel_properties.vgpr_count, - entry->kernel_properties.sgpr_count, + (entry->kernel_properties.vgpr_count + 1) * agent_info->vgpr_block_size, + (entry->kernel_properties.sgpr_count + agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, entry->kernel_properties.fbarrier_count, entry->kernel_properties.signal.handle, nik_name.c_str()); @@ -658,10 +659,10 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; if (workgroup_size > UINT32_MAX) abort(); kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; - kernel_properties_ptr->lds_size = packet->group_segment_size; + kernel_properties_ptr->lds_size = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE); // packet->group_segment_size; kernel_properties_ptr->scratch_size = packet->private_segment_size; - kernel_properties_ptr->vgpr_count = kernel_code->reserved_vgpr_count; - kernel_properties_ptr->sgpr_count = kernel_code->reserved_sgpr_count; + kernel_properties_ptr->vgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); + 
kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; kernel_properties_ptr->signal = callback_data->completion_signal; @@ -881,6 +882,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } it = opts.find("trace-local"); if (it != opts.end()) { settings->trace_local = (it->second == "on"); } + it = opts.find("obj-tracking"); + if (it != opts.end()) { settings->code_obj_tracking = (it->second == "on"); } it = opts.find("memcopies"); if (it != opts.end()) { settings->memcopy_tracking = (it->second == "on"); } } @@ -901,6 +904,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) check_env_var("ROCP_TRACE_SIZE", settings->trace_size); // Set trace local buffer check_env_var("ROCP_TRACE_LOCAL", settings->trace_local); + // Set code objects tracking + check_env_var("ROCP_OBJ_TRACKING", settings->code_obj_tracking); // Set memcopies tracking check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking); diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 35568ba0..d23a445d 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -140,7 +140,7 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer - timer_ = new HsaTimer; + timer_ = new HsaTimer(&hsa_api_); CHECK_STATUS("HSA timer allocation failed", (timer_ == NULL) ? 
HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); @@ -167,7 +167,6 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_init = table->core_->hsa_init_fn; hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; - hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; @@ -175,36 +174,39 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; - hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn; - - hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; + hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn; hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; + 
hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; + + hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; + hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn; hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn; - hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn; hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn; hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; } else { hsa_api_.hsa_init = hsa_init; hsa_api_.hsa_shut_down = hsa_shut_down; hsa_api_.hsa_agent_get_info = hsa_agent_get_info; - hsa_api_.hsa_iterate_agents = hsa_iterate_agents; hsa_api_.hsa_queue_create = hsa_queue_create; @@ -212,31 +214,35 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + hsa_api_.hsa_signal_create = hsa_signal_create; hsa_api_.hsa_signal_destroy = hsa_signal_destroy; + hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed; hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire; - - 
hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; - hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; - hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; - hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; - - hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; - - hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file; hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; + hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; + + hsa_api_.hsa_system_get_info = hsa_system_get_info; + hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; + hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; + hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; + hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; + hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = hsa_amd_profiling_set_profiler_enabled; hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time; hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time; - hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; - hsa_api_.hsa_signal_store_screlease = 
hsa_signal_store_screlease; } } } @@ -329,6 +335,11 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); + // GFX8 and GFX9 SGPR/VGPR block sizes + agent_info->sgpr_block_dflt = (strcmp(agent_info->gfxip, "gfx8") == 0) ? 1 : 2; + agent_info->sgpr_block_size = 8; + agent_info->vgpr_block_size = 4; + // Set GPU index agent_info->dev_index = gpu_list_.size(); gpu_list_.push_back(agent_info); @@ -672,7 +683,57 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } +const char* HsaRsrcFactory::GetKernelName(uint64_t addr) { + std::lock_guard lck(mutex_); + const auto it = symbols_map_->find(addr); + if (it == symbols_map_->end()) { + fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); + abort(); + } + return strdup(it->second); +} + +void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { + std::lock_guard lck(mutex_); + executable_tracking_on_ = true; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; +} + +hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) { + hsa_symbol_kind_t value = (hsa_symbol_kind_t)0; + hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value); + CHECK_STATUS("Error in getting symbol info", status); + if (value == HSA_SYMBOL_KIND_KERNEL) { + uint64_t addr = 0; + uint32_t len = 0; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr); + CHECK_STATUS("Error in getting kernel object", status); + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); + CHECK_STATUS("Error in getting name len", status); + char *name = new 
char[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + CHECK_STATUS("Error in getting kernel name", status); + name[len] = 0; + auto ret = symbols_map_->insert({addr, name}); + if (ret.second == false) { + delete[] ret.first->second; + ret.first->second = name; + } + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) { + std::lock_guard lck(mutex_); + if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); + CHECK_STATUS("Error in iterating executable symbols", status); + return hsa_api_.hsa_executable_freeze(executable, options);; +} + std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; hsa_pfn_t HsaRsrcFactory::hsa_api_{}; +bool HsaRsrcFactory::executable_tracking_on_ = false; +HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index 9c0207e2..151dab8e 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -73,7 +73,6 @@ struct hsa_pfn_t { decltype(hsa_init)* hsa_init; decltype(hsa_shut_down)* hsa_shut_down; decltype(hsa_agent_get_info)* hsa_agent_get_info; - decltype(hsa_iterate_agents)* hsa_iterate_agents; decltype(hsa_queue_create)* hsa_queue_create; @@ -81,30 +80,35 @@ struct hsa_pfn_t { decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + decltype(hsa_signal_create)* hsa_signal_create; decltype(hsa_signal_destroy)* hsa_signal_destroy; + decltype(hsa_signal_load_relaxed)* 
hsa_signal_load_relaxed; decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed; decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire; - - decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; - decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; - decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate; - decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; - decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; - - decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file; decltype(hsa_executable_create_alt)* hsa_executable_create_alt; decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; decltype(hsa_executable_freeze)* hsa_executable_freeze; decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; + decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; + + decltype(hsa_system_get_info)* hsa_system_get_info; + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; + decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled; decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time; decltype(hsa_amd_profiling_get_dispatch_time)* 
hsa_amd_profiling_get_dispatch_time; - decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed; - decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; }; // Encapsulates information about a Hsa Agent such as its @@ -156,6 +160,11 @@ struct AgentInfo { // Number of Shader Arrays Per Shader Engines in Gpu uint32_t shader_arrays_per_se; + + // SGPR/VGPR block sizes + uint32_t sgpr_block_dflt; + uint32_t sgpr_block_size; + uint32_t vgpr_block_size; }; // HSA timer class @@ -166,9 +175,9 @@ class HsaTimer { static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; - HsaTimer() { + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { timestamp_t sysclock_hz = 0; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; } @@ -184,7 +193,7 @@ class HsaTimer { // Return timestamp in 'ns' timestamp_t timestamp_ns() const { timestamp_t sysclock; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); return sysclock_to_ns(sysclock); } @@ -192,6 +201,8 @@ class HsaTimer { private: // Timestamp frequency factor freq_t sysclock_factor_; + // HSA API table + const hsa_pfn_t* const hsa_api_; }; class HsaRsrcFactory { @@ -317,6 +328,11 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + // Enable executables loading tracking + static bool IsExecutableTracking() { return executable_tracking_on_; } + static void EnableExecutableTracking(HsaApiTable* 
table); + static const char* GetKernelName(uint64_t addr); + // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); static const hsa_pfn_t* HsaApi() { return &hsa_api_; } @@ -381,6 +397,13 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; + // Executables loading tracking + typedef std::map symbols_map_t; + static symbols_map_t* symbols_map_; + static bool executable_tracking_on_; + static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); + // HSA runtime API table static hsa_pfn_t hsa_api_; From 3eb4a67795904a51f80ed93ea55147954bce49a2 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 19 Dec 2019 19:49:37 -0600 Subject: [PATCH 072/153] fixing stat file names expression --- bin/tblextr.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 490cdb8b..2c4e442b 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -446,6 +446,10 @@ def fill_ops_db(table_name, db, indir): statfile = re.sub(r'\.csv$', '.stats.csv', csvfile) jsonfile = re.sub(r'\.csv$', '.json', csvfile) + hsa_statfile = re.sub(r'\.stats\.csv$', r'.hsa_stats.csv', statfile) + hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile) + kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) + with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) @@ -494,9 +498,8 @@ def fill_ops_db(table_name, db, indir): dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) if hsa_trace_found: - statfile = re.sub(r'stats', r'hsa_stats', statfile) dform.post_process_data(db, 'HSA') - dform.gen_table_bins(db, 'HSA', statfile, 'Name', 'DurationNs') + dform.gen_table_bins(db, 'HSA', hsa_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) if hsa_activity_found: @@ -504,18
+507,16 @@ def fill_ops_db(table_name, db, indir): dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) if hip_trace_found: - statfile = re.sub(r'stats', r'hip_stats', statfile) dform.post_process_data(db, 'HIP') - dform.gen_table_bins(db, 'HIP', statfile, 'Name', 'DurationNs') + dform.gen_table_bins(db, 'HIP', hip_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) if kfd_trace_found: - statfile = re.sub(r'stats', r'kfd_stats', statfile) dform.post_process_data(db, 'KFD') - dform.gen_table_bins(db, 'KFD', statfile, 'Name', 'DurationNs') + dform.gen_table_bins(db, 'KFD', kfd_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'KFD', START_US, jsonfile) if any_trace_found: From 3667c0439a776b3bc84a0fe769176e9b9f1ffd8b Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 20 Dec 2019 11:18:43 -0600 Subject: [PATCH 073/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f63df0b..9ed4010a 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,7 @@ Options: See roctracer documentation on rocTX API details. Configuration file: - You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:$HOME: First the configuration file is looking in the current directory, then in your home, and then in the package directory. Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. 
An example of 'rpl_rc.xml': From fa28e6d677502db025e2fd7223afdb95242af97f Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 31 Dec 2019 06:16:33 -0600 Subject: [PATCH 074/153] spec update --- doc/rocprofiler_spec.md | 414 +++++++++++++++++++++++++++++++--------- 1 file changed, 326 insertions(+), 88 deletions(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index a8ef7a2b..496595f1 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -64,7 +64,6 @@ To check the conformance of used library APi header and the library binary the v macros and API methods can be used. Returning the error and error string methods: -- rocprofiler_errno - method for returning the error number - rocprofiler_error_string - method for returning the error string Library version: @@ -114,12 +113,18 @@ The library provides back compatibility if the library major version is less or Returned API status: - hsa_status_t - HSA status codes are used from hsa.h header +Loadable plugin on-load/unload methods: +- rocprofiler_settings_t – global properties +- OnLoadTool +- OnLoadToolProp +- OnUnloadTool + Info API: - rocprofiler_info_kind_t - profiling info kind - rocprofiler_info_query_t - profiling info query - rocprofiler_info_data_t - profiling info data - rocprofiler_get_info - return the info for a given info kind -- rocprofiler_iterate_info - iterate over the info for a given info kind +- rocprofiler_iterate_info - iterate over the info for a given info kind - rocprofiler_query_info - iterate over the info for a given info query Context API: @@ -137,6 +142,8 @@ Context API: - rocprofiler_get_group - return profiling group for a given index - rocprofiler_get_metrics - method for calculating the metrics data - rocprofiler_iterate_trace_data - method for iterating output trace data instances +- rocprofiler_time_id_t - supported time value ID enumeration +- rocprofiler_get_time – return time for a given time ID and profiling timestamp value Sampling API: -
rocprofiler_start - start profiling @@ -152,10 +159,44 @@ Sampling API: Intercepting API: - rocprofiler_callback_t - profiling callback type - rocprofiler_callback_data_t - profiling callback data type +- rocprofiler_dispatch_record_t – dispatch record +- rocprofiler_queue_callbacks_t – queue callbacks, dispatch/destroy - rocprofiler_set_queue_callbacks - set queue kernel dispatch and queue destroy callbacks - rocprofiler_remove_queue_callbacks - remove queue callbacks + +Context pool API: +- rocprofiler_pool_t – context pool handle +- rocprofiler_pool_entry_t – context pool entry +- rocprofiler_pool_properties_t – context pool properties +- rocprofiler_pool_handler_t – context pool completion handler +- rocprofiler_pool_open - context pool open +- rocprofiler_pool_close - context pool close +- rocprofiler_pool_fetch – fetch and empty context entry to pool +- rocprofiler_pool_release – release a context entry +- rocprofiler_pool_iterate – iterated fetched context entries +- rocprofiler_pool_flush – flush completed context entries +``` +### 4.2. Loading and Configuring +``` +Loading and Configuring +The profiling properties can be set by profiler plugin on loading by ROC runtime. +The profiler library plugin can be set by ROCP_TOOL_LIB env var. + +Global properties: + +typedef struct { + uint32_t intercept_mode; + uint64_t timeout; + uint32_t timestamp_on; +} rocprofiler_settings_t; + +On load/unload methods defined in profiling tool library loaded by ROCP_TOOL_LIB env var: +extern "C" void OnLoadTool(); +extern "C" void OnLoadToolProp(rocprofiler_settings_t* settings); +extern "C" void OnUnloadTool(); + ``` -### 4.2. Info API +### 4.3. Info API ``` The profiling metrics are defined by name and the traces are defined by name and parameters. All supported features can be iterated using 'iterate_info/query_info' methods. The counter @@ -163,6 +204,7 @@ names are defined in counters table configuration file, each counter has a uniqu defined by block name and event id. 
The traces and trace parameters names are same as in the hardware documentation and the parameters codes are rocprofiler_feature_parameter_t values, see below in the "Context API" section. + Profiling info kind: typedef enum { @@ -220,7 +262,7 @@ has_status_t rocprofiler_query_info( hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback void *data); // data passed to callback ``` -### 4.3. Context API +### 4.4. Context API ``` Profiling context is accumulating all profiling information including profiling features which carry profiling data, required buffers for profiling command packets and output data. @@ -381,8 +423,22 @@ hsa_status_t rocprofiler_iterate_trace_data( hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate // the output data void* callback_data); // [in/out] passed to callback data + +Converting of profiling timestamp to time value for supported time ID. +Supported time value ID enumeration: +typedef enum { + ROCPROFILER_TIME_ID_CLOCK_REALTIME = 0, // Linux realtime clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 1, // Linux monotonic clock time +} rocprofiler_time_id_t; + +Method for converting of profiling timestamp to time value for a given time ID: +hsa_status_t rocprofiler_get_time( + rocprofiler_time_id_t time_id, // identifier of the particular + // time to convert the timestamp + uint64_t timestamp, // profiling timestamp + uint64_t* value_ns); // [out] returned time ‘ns’ value ``` -### 4.4. Sampling API +### 4.5. Sampling API ``` The API supports the counters sampling usage model with start/read/stop methods and also lets to wait for the profiling data in the intercepting usage model with get_data method. @@ -423,7 +479,7 @@ hsa_status_t rocprofiler_group_read( hsa_status_t rocprofiler_group_get_data( rocprofiler_group_t* group); // [in/out] profiling group ``` -### 4.5. Intercepting API +### 4.6.
Intercepting API ``` The library provides a callback API for enabling profiling for the kernels dispatched to HSA AQL queues. The API enables per-kernel profiling data collection. @@ -471,34 +527,101 @@ hsa_status_t rocprofiler_set_intercepting( hsa_status_t rocprofiler_remove_intercepting(); ``` +### 4.7. Profiling Context Pools +``` +The API provide capability to create a context pool for a given agent and a set of features, to fetch/release a context entry, to register a callback for pool’s contexts completion. +Profiling pool handle: +typename rocprofiler_pool_t; +Profiling pool entry: +typedef struct { + rocprofiler_t* context; // context object + void* payload; // payload data object +} rocprofiler_pool_entry_t; + +Profiling handler, calling on profiling completion: +typedef bool (*rocprofiler_pool_handler_t)(const rocprofiler_pool_entry_t* entry, void* arg); + +Profiling properties: +typedef struct { + uint32_t num_entries; // pool size entries + uint32_t payload_bytes; // payload size bytes + rocprofiler_pool_handler_t handler; // handler on context completion + void* handler_arg; // the handler arg +} rocprofiler_pool_properties_t; + +Open profiling pool: +hsa_status_t rocprofiler_pool_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_pool_t** pool, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_pool_properties_t*); // pool properties + +Close profiling pool: +hsa_status_t rocprofiler_pool_close( + rocprofiler_pool_t* pool); // profiling pool handle + +Fetch profiling pool entry: +hsa_status_t rocprofiler_pool_fetch( + rocprofiler_pool_t* pool, // profiling pool handle + rocprofiler_pool_entry_t* entry); // [out] empty profiling pool entry + +Release profiling pool entry: +hsa_status_t rocprofiler_pool_release( + rocprofiler_pool_entry_t* entry); // released profiling pool entry + +Iterate fetched 
profiling pool entries: +hsa_status_t rocprofiler_pool_iterate( + rocprofiler_pool_t* pool, // profiling pool handle + hsa_status_t (*callback)(rocprofiler_pool_entry_t* entry, void* data), + // callback + void *data); // [in/out] data passed to callback + +Flush completed entries in profiling pool: +hsa_status_t rocprofiler_pool_flush( + rocprofiler_pool_t* pool); // profiling pool handle +``` ## 5. Application code examples ### 5.1. Querying available metrics ``` Info data callback: - hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void *data) { - switch (info.kind) { - case ROCPROFILER_INFO_KIND_METRIC: { - printf("metric %s, description %s\n", - info.metric.name, - info.metric.description); - break; - } - default: - printf("wrong info kind %u\n", kind); - return HSA_STATUS_ERROR; - } - return HSA_STATUS_SUCCESS; - } + hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void *data) { + switch (info.kind) { + case ROCPROFILER_INFO_KIND_METRIC: { + if (info.metric.expr != NULL) { + fprintf(stdout, "Basic counter: gpu-agent%d : %s : %s\n", + info.agent_index, info.metric.name, info.metric.description); + fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); + } else { + fprintf(stdout, "Derived counter: gpu-agent%d : %s", + info.agent_index, info.metric.name); + if (info.metric.instances > 1) { + fprintf(stdout, "[0-%u]", info.metric.instances - 1); + } + fprintf(stdout, " : %s\n", info.metric.description); + fprintf(stdout, " block %s has %u counters\n", + info.metric.block_name, info.metric.block_counters); + } + fflush(stdout); + break; + } + default: + printf("wrong info kind %u\n", kind); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; + } Printing all available metrics: - hsa_status_t status = rocprofiler_iterate_info( - agent, - ROCPROFILER_INFO_KIND_METRIC, - info_data_callback, - NULL); - + hsa_status_t status = rocprofiler_iterate_info( + agent, + ROCPROFILER_INFO_KIND_METRIC, + 
info_data_callback, + NULL); + ``` ### 5.2. Profiling code example ``` @@ -509,85 +632,200 @@ saved and then direct context method rocprofiler_get_data with default group ind can be used. hsa_status_t_dispatch_callback( - const rocprofiler_callback_data_t* callback_data, - void* user_data, - rocprofiler_group_t* group) + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t* group) { - hsa_status_t status = HSA_STATUS_SUCCESS; - // Profiling context - rocprofiler_t* context; - // Profiling info objects - rocprofiler_feature_t features* = new rocprofiler_feature_t[2]; - // Tracing parameters - rocprofiler_feature_parameter_t* parameters = new rocprofiler_feature_parameter_t[2]; - - // Setting profiling features - features[0].type = ROCPROFILER_METRIC; - features[0].name = "L1_MISS_RATIO"; - features[1].type = ROCPROFILER_METRIC; - features[1].name = "DRAM_BANDWIDTH"; - - // Creating profiling context - status = rocprofiler_open(callback_data->dispatch.agent, features, 2, &context, - ROCPROFILER_MODE_SINGLEGROUP, NULL); - - - // Get the profiling group - // For general case with many groups there is rocprofiler_group_count() API - const uint32_t group_index = 0 - status = rocprofiler_get_group(context, group_index, group); - - - // In SINGLEGROUP mode the context handle itself can be saved, because there is just one group - - - return status; + hsa_status_t status = HSA_STATUS_SUCCESS; + // Profiling context + rocprofiler_t* context; + // Profiling info objects + rocprofiler_feature_t features* = new rocprofiler_feature_t[2]; + // Tracing parameters + rocprofiler_feature_parameter_t* parameters = new rocprofiler_feature_parameter_t[2]; + + // Setting profiling features + features[0].type = ROCPROFILER_METRIC; + features[0].name = "L1_MISS_RATIO"; + features[1].type = ROCPROFILER_METRIC; + features[1].name = "DRAM_BANDWIDTH"; + + // Creating profiling context + status = rocprofiler_open(callback_data->dispatch.agent, features, 2, 
&context, + ROCPROFILER_MODE_SINGLEGROUP, NULL); + + + // Get the profiling group + // For general case with many groups there is rocprofiler_group_count() API + const uint32_t group_index = 0 + status = rocprofiler_get_group(context, group_index, group); + + + // In SINGLEGROUP mode the context handle itself can be saved, because there is just one group + + + return status; } +Profiling tool constructor is adding the dispatch callback: + void profiling_libary_constructor() { - // Defining callback data, no data in this simple example - void* callback_data = NULL; + // Defining callback data, no data in this simple example + void* callback_data = NULL; - // Adding observers - hsa_sttaus_t status = rocprofiler_add_dispatch_callback(dispatch_callback, callback_data); - + // Adding observers + hsa_sttaus_t status = rocprofiler_add_dispatch_callback(dispatch_callback, callback_data); + - // Dispatching profiled kernel - + // Dispatching profiled kernel + } void profiling_libary_destructor() { - > { - // In SINGLEGROUP mode the rocprofiler_get_group() method with default zero group - // index can be used, if context handle would be saved - status = rocprofiler_group_get_data(entry->group); - - status = rocprofiler_get_metrics(entry->group->context); - - status = rocprofiler_close(entry->group->context); - - - dispatch_data, entry->features, entry->features_count)>; - } + > { + // In SINGLEGROUP mode the rocprofiler_get_group() method with default zero group + // index can be used, if context handle would be saved + status = rocprofiler_group_get_data(entry->group); + + status = rocprofiler_get_metrics(entry->group->context); + + status = rocprofiler_close(entry->group->context); + + + dispatch_data, entry->features, entry->features_count)>; + } } ``` ### 5.3. Option to use completion callback ``` Creating profiling context with completion callback: - . . . 
- rocprofiler_properties_t properties = {}; - properties.callback = completion_callback; - properties.callback_arg = NULL; // no args defined - status = rocprofiler_open(agent, features, 3, &context, - ROCPROFILER_MODE_SINGLEGROUP, properties); - - . . . + . . . + rocprofiler_properties_t properties = {}; + properties.callback = completion_callback; + properties.callback_arg = NULL; // no args defined + status = rocprofiler_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . Definition of completion callback: void completion_callback(profiler_group_t group, void* arg) { - - hsa_status_t status = rocprofiler_close(group.context); - + + hsa_status_t status = rocprofiler_close(group.context); + +} +``` +### 5.4. Option to Use Context Pool +``` +Code example of context pool usage. +Creating profiling contexts pool: + . . . + rocprofiler_pool_properties_t properties{}; + properties.num_entries = 100; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_handler; + properties.handler_arg = handler_arg; + status = rocprofiler_pool_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . + +Fetching a context entry: + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast + (pool_entry.payload); +``` +### 5.5. Standalone Sampling Usage Code Example +``` +The profiling metrics are being read from separate standalone queue other than the application kernels are submitted to. To enable the sampling mode, the profiling mode in all user queues should be enabled. It can be done by loading ROC-profiler library to HSA runtime using the environment variable HSA_TOOLS_LIB for all shell sessions. 
+ // Sampling rate + uint32_t sampling_rate = ; + // Sampling count + uint32_t sampling_count = ; + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + // HSA agent + hsa_agent_t agent; + // Profiling context + rocprofiler_t* context = NULL; + // Profiling properties + rocprofiler_properties_t properties; + + // Getting HSA agent + + + // Profiling feature objects + const unsigned feature_count = 2; + rocprofiler_feature_t feature[feature_count]; + + // Counters and metrics + feature[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[0].name = "GPUBusy"; + feature[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[1].name = "SQ_WAVES"; + + // Creating profiling context with standalone queue + properties = {}; + properties.queue_depth = 128; + status = rocprofiler_open(agent, feature, feature_count, &context, + ROCPROFILER_MODE_STANDALONE| ROCPROFILER_MODE_CREATEQUEUE| + ROCPROFILER_MODE_SINGLEGROUP, &properties); + + + // Start counters and sample them in the loop with the sampling rate + status = rocprofiler_start(context, 0); + + + for (unsigned ind = 0; ind < sampling_count; ++ind) { + sleep(sampling_rate); + status = rocprofiler_read(context, 0); + + status = rocprofiler_get_data(context, 0); + + status = rocprofiler_get_metrics(context); + + print_results(feature, feature_count); + } + + // Stop counters + status = rocprofiler_stop(context, group_n); + + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + status = rocprofiler_close(context); + +``` +### 5.6. 
Printing Out Profiling Results +``` +Below is a code example for printing out the profiling results from profiling features array: +void print_results(rocprofiler_feature_t* feature, uint32_t feature_count) { + for (rocprofiler_feature_t* p = feature; p < feature + feature_count; ++p) + { + std::cout << (p - feature) << ": " << p->name; + switch (p->data.kind) { + case ROCPROFILER_DATA_KIND_INT64: + std::cout << " result_int64 (" << p->data.result_int64 << ")" + << std::endl; + break; + + case ROCPROFILER_DATA_KIND_BYTES: { + std::cout << " result_bytes ptr(" << p->data.result_bytes.ptr << + ") " << " size(" << p->data.result_bytes.size << ")" + << " instance_count(" << p->data.result_bytes.instance_count + << ")"; + break; + } + default: + std::cout << "bad result kind (" << p->data.kind << ")" + << std::endl; + + } + } } ``` From ca823d250b17b8b5d3989a29673b3db451337901 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 31 Dec 2019 06:20:30 -0600 Subject: [PATCH 075/153] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 496595f1..57bf2739 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -113,7 +113,7 @@ The library provides back compatibility if the library major version is less or Returned API status: - hsa_status_t - HSA status codes are used from hsa.h header -Loadable plugin on-load/unload methods: +Loading and Configuring, loadable plugin on-load/unload methods: - rocprofiler_settings_t – global properties - OnLoadTool - OnLoadToolProp From d93e1b447adec157321ab47ef8539fdc8234c46f Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 31 Dec 2019 07:26:27 -0600 Subject: [PATCH 076/153] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 9ed4010a..c1b7692f 100644 --- 
a/README.md +++ b/README.md @@ -4,6 +4,9 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. +['rocprof' cmdline tool specification](doc/rocprof.md) +[API specification](doc/rocprofiler_spec.md) + ## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 64bd4dec0e34ce59df8ba213593d11e17cc9547e Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 31 Dec 2019 07:27:17 -0600 Subject: [PATCH 077/153] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c1b7692f..52c5a27e 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ HW specific low-level performance analysis interface for profiling of GPU comput profiling includes HW performance counters with complex performance metrics. ['rocprof' cmdline tool specification](doc/rocprof.md) + [API specification](doc/rocprofiler_spec.md) ## Metrics From 95a4317bc0f61be8a64c4d4104cedb2dc7c9e210 Mon Sep 17 00:00:00 2001 From: Srinivasan Subramanian Date: Wed, 18 Sep 2019 18:26:18 -0700 Subject: [PATCH 078/153] multiple rocm version support, remove shared library conflicts Change-Id: Ic618c90be4c6274b4c6fbc43e46c321d60fe1c28 --- CMakeLists.txt | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b81b5a6..edc30d1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,9 @@ cmake_minimum_required ( VERSION 2.8.12 ) +# Install prefix +set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix default") + ## Verbose output. 
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) @@ -45,13 +48,22 @@ message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" ) set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) -set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" ) set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) endif () set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) +set ( LIB_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( LIB_VERSION_MINOR ${VERSION_MINOR} ) +if ( ${ROCM_PATCH_VERSION} ) + set ( LIB_VERSION_PATCH ${ROCM_PATCH_VERSION} ) +else() + set ( LIB_VERSION_PATCH ${VERSION_PATCH} ) +endif() +set ( LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}" ) +message ( "-- LIB-VERSION STRING: ${LIB_VERSION_STRING}" ) + ## Set target and root/lib/test directory set ( TARGET_NAME "${ROCPROFILER_TARGET}" ) set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) @@ -63,7 +75,7 @@ include ( ${LIB_DIR}/CMakeLists.txt ) ## Set the VERSION and SOVERSION values set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION "${LIB_VERSION_STRING}" ) -set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" ) +set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" ) ## If the library is a release, strip the target library if ( "${CMAKE_BUILD_TYPE}" STREQUAL release ) From 6cc11fa0c5498ae9a8ffbb3c5866dc103d4da2d3 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 6 Jan 2020 13:37:10 -0600 Subject: [PATCH 079/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
52c5a27e..50b61cf7 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ profiling includes HW performance counters with complex performance metrics. ['rocprof' cmdline tool specification](doc/rocprof.md) -[API specification](doc/rocprofiler_spec.md) +['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) ## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 2904d293d704f5399f95b773daf908746d9c28e4 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 6 Jan 2020 13:38:02 -0600 Subject: [PATCH 080/153] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 50b61cf7..b85f5084 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. +## Documentation ['rocprof' cmdline tool specification](doc/rocprof.md) ['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) From 145232e0def29a20a5d405a0a229c20f621c0319 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 6 Jan 2020 13:43:51 -0600 Subject: [PATCH 081/153] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b85f5084..331a6ffc 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,8 @@ HW specific low-level performance analysis interface for profiling of GPU comput profiling includes HW performance counters with complex performance metrics. 
## Documentation -['rocprof' cmdline tool specification](doc/rocprof.md) - -['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) +- ['rocprof' cmdline tool specification](doc/rocprof.md) +- ['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) ## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 628051723ed4f757c2615dae32d9480a529214ae Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 7 Jan 2020 11:11:52 -0600 Subject: [PATCH 082/153] adding: get_time() API; C test; --- bin/rpl_run.sh | 8 +++- inc/rocprofiler.h | 14 ++++++- src/core/rocprofiler.cpp | 9 ++++ src/util/hsa_rsrc_factory.cpp | 5 +++ src/util/hsa_rsrc_factory.h | 79 +++++++++++++++++++++++++++++++++++ test/CMakeLists.txt | 10 +++-- test/app/c_test.c | 25 +++++++++++ test/run.sh | 21 +++++----- 8 files changed, 155 insertions(+), 16 deletions(-) create mode 100644 test/app/c_test.c diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 86383d14..ce492e81 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -179,6 +179,7 @@ usage() { echo " " echo " " echo "" + echo " --trace-start - to enable tracing on start [on]" echo " --trace-period - to enable trace with initial delay, with periodic sample length and rate" echo " Supported time formats: " echo " --obj-tracking - to turn on/off kernels code objects tracking [off]" @@ -232,12 +233,13 @@ run() { fi API_TRACE="" + LD_PRELOAD="" if [ "$ROCTX_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":roctx" fi if [ "$KFD_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":kfd" - export LD_PRELOAD="libkfdwrapper64.so libhsakmt.so.1" + export LD_PRELOAD="libkfdwrapper64.so libhsakmt.so.1 $LD_PRELOAD" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" @@ -392,6 +394,10 @@ while [ 1 ] ; do export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 HIP_TRACE=1 + elif [ "$1" = "--trace-start" ] ; then + if [ "$2" = "off" ] ; then + export ROCP_CTRL_RATE="-1" + fi elif [ "$1" = "--trace-period" ] ; then 
period_expr="^\([^:]*\):\([^:]*\):\([^:]*\)$" period_ck=`echo "$2" | sed -n "s/"${period_expr}"/ok/p"` diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 313f7f42..31082cf4 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -199,6 +199,18 @@ hsa_status_t rocprofiler_close(rocprofiler_t* context); // [in] profiling conte hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling context uint32_t group_index); // group index +// Supported time value ID +typedef enum { + ROCPROFILER_TIME_ID_CLOCK_REALTIME = 0, // Linux realtime clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 1, // Linux monotonic clock time +} rocprofiler_time_id_t; + +// Return time value for a given time ID and profiling timestamp +hsa_status_t rocprofiler_get_time( + rocprofiler_time_id_t time_id, // identifier of the particular time to convert the timestamp + uint64_t timestamp, // profiling timestamp + uint64_t* value_ns); // [out] returned time 'ns' value + //////////////////////////////////////////////////////////////////////////////// // Queue callbacks // @@ -376,7 +388,7 @@ hsa_status_t rocprofiler_query_info( hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback void *data); // [in/out] data passed to callback -// Creates a profiled queue. All dispatches on this queue will be profiled +// Create a profiled queue.
All dispatches on this queue will be profiled hsa_status_t rocprofiler_queue_create_profiled( hsa_agent_t agent_handle,uint32_t size, hsa_queue_type32_t type, void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 61fd4619..3f1362a7 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -843,4 +843,13 @@ PUBLIC_API hsa_status_t rocprofiler_queue_create_profiled( return rocprofiler::InterceptQueue::QueueCreateTracked(agent, size, type, callback, data, private_segment_size, group_segment_size, queue); } +// Return time value for a given time ID and profiling timestamp +hsa_status_t rocprofiler_get_time( + rocprofiler_time_id_t time_id, + uint64_t timestamp, + uint64_t* value_ns) +{ + return rocprofiler::util::HsaRsrcFactory::Instance().GetTime(time_id, timestamp, value_ns); +} + } // extern "C" diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 65f94357..9ce362d4 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -150,6 +150,11 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize CHECK_STATUS("HSA timer allocation failed", (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + // Time correlation + const uint32_t corr_iters = 1000; + CorrelateTime(HsaTimer::TIME_ID_CLOCK_REALTIME, corr_iters); + CorrelateTime(HsaTimer::TIME_ID_CLOCK_MONOTONIC, corr_iters); + // System timeout timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index bf7f5fcf..0362bc2c 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -35,6 +35,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include @@ -177,6 +178,12 @@ class HsaTimer { static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; + enum time_id_t { + TIME_ID_CLOCK_REALTIME = 0, + TIME_ID_CLOCK_MONOTONIC = 1, + TIME_ID_NUMBER + }; + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { timestamp_t sysclock_hz = 0; hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); @@ -192,6 +199,11 @@ class HsaTimer { return timestamp_t((freq_t)time / sysclock_factor_); } + // Method for timespec/ns conversion + timestamp_t timespec_to_ns(const timespec& time) const { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; + } + // Return timestamp in 'ns' timestamp_t timestamp_ns() const { timestamp_t sysclock; @@ -200,6 +212,54 @@ class HsaTimer { return sysclock_to_ns(sysclock); } + // Return time in 'ns' + timestamp_t clocktime_ns(clockid_t clock_id) const { + timespec time; + clock_gettime(clock_id, &time); + return timespec_to_ns(time); + } + + // Return pair of correlated values of profiling timestamp and time with + // correlation error for a given time ID and number of iterations + void correlated_pair_ns(time_id_t time_id, uint32_t iters, + timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) { + clockid_t clock_id = 0; + switch (clock_id) { + case TIME_ID_CLOCK_REALTIME: + clock_id = CLOCK_REALTIME; + break; + case TIME_ID_CLOCK_MONOTONIC: + clock_id = CLOCK_MONOTONIC; + break; + default: + CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR); + } + + std::vector ts_vec(iters); + std::vector tm_vec(iters); + const uint32_t steps = iters - 1; + + for (uint32_t i = 0; i < iters; ++i) { + hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]); + clock_gettime(clock_id, &tm_vec[i]); + } + + const timestamp_t ts_base = sysclock_to_ns(ts_vec.front()); + const timestamp_t tm_base = timespec_to_ns(tm_vec.front()); + const 
timestamp_t error = (ts_vec.back() - ts_vec.front()) / (2 * steps); + + timestamp_t ts_accum = 0; + timestamp_t tm_accum = 0; + for (uint32_t i = 0; i < iters; ++i) { + ts_accum += (ts_vec[i] - ts_base); + tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base); + } + + *timestamp_v = (ts_accum / iters) + ts_base + error; + *time_v = (tm_accum / iters) + tm_base; + *error_v = error; + } + private: // Timestamp frequency factor freq_t sysclock_factor_; @@ -359,6 +419,21 @@ class HsaRsrcFactory { if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); } + void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters) { + timestamp_t timestamp_v = 0; + timestamp_t time_v = 0; + timestamp_t error_v = 0; + timer_->correlated_pair_ns(time_id, iters, &timestamp_v, &time_v, &error_v); + time_shift_[time_id] = time_v - timestamp_v; + time_error_[time_id] = error_v; + } + + hsa_status_t GetTime(uint32_t time_id, uint64_t value, uint64_t* time) { + if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; + *time = value + time_shift_[time_id]; + return HSA_STATUS_SUCCESS; + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -423,6 +498,10 @@ class HsaRsrcFactory { // HSA timer HsaTimer* timer_; + // Time shift array to support time conversion + timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER]; + timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER]; + // CPU/kern-arg memory pools hsa_amd_memory_pool_t *cpu_pool_; hsa_amd_memory_pool_t *kern_arg_pool_; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9212f2af..1ae8a554 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -32,6 +32,10 @@ if ( NOT DEFINED TEST_DIR ) include ( env ) endif () +## C test +add_executable ( "c_test" ${TEST_DIR}/app/c_test.c ) +target_include_directories ( "c_test" PRIVATE ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) + ## Util sources file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" ) @@
-67,17 +71,17 @@ execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR} ## Building standalone test executable add_executable ( ${STEXE_NAME} ${STTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) target_include_directories ( ${STEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries( ${STEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +target_link_libraries ( ${STEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) ## Building intercept test executable add_library ( ${INEXE_NAME} SHARED ${INTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) target_include_directories ( ${INEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries( ${INEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +target_link_libraries ( ${INEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) ## Building ctrl test executable add_executable ( ${EXE_NAME} ${CTRL_SRC} ${UTIL_SRC} ${KERN_SRC} ) target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +target_link_libraries ( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/tool/*.xml ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "mkdir -p ${PROJECT_BINARY_DIR}/RESULTS" ) diff --git a/test/app/c_test.c b/test/app/c_test.c new file mode 100644 index 00000000..70c6d306 --- /dev/null +++ b/test/app/c_test.c @@ -0,0 +1,25 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "inc/rocprofiler.h" +const int ret = 0; +int main() { return ret; } diff --git a/test/run.sh b/test/run.sh index dfd15e34..4612fa1c 100755 --- a/test/run.sh +++ b/test/run.sh @@ -56,8 +56,6 @@ eval_test() { export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # paths to ROC profiler and oher libraries export LD_LIBRARY_PATH=$PWD -# ROC profiler library loaded by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so # enable error messages logging to '/tmp/rocprofiler_log.txt' export ROCPROFILER_LOG=1 # ROC profiler metrics config file @@ -67,8 +65,17 @@ export ROCP_METRICS=metrics.xml # test trace export ROC_TEST_TRACE=1 -## Intercepting usage model test +## C test +eval_test "C test" ./test/c_test +## Standalone sampling usage model test +unset HSA_TOOLS_LIB +unset ROCP_TOOL_LIB +eval_test "Standalone sampling usage model test" ./test/standalone_test + +## Intercepting usage model test +# ROC profiler library loaded by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so # tool library loaded by ROC profiler export ROCP_TOOL_LIB=./test/libintercept_test.so export ROCP_KITER=50 @@ -77,13 +84,7 @@ export ROCP_AGENTS=1 export ROCP_THRS=3 eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl" -## Standalone sampling usage model test - -unset ROCP_TOOL_LIB -eval_test "Standalone sampling usage model test" ./test/standalone_test - ## Libtool test - # tool library loaded by ROC profiler export ROCP_TOOL_LIB=libtool.so # ROC profiler kernels timing @@ -111,7 +112,6 @@ export ROCP_INPUT=input1.xml eval_test "'rocprof' libtool test n-threads" ./test/ctrl ## Libtool test, counter sets - # Memcopies tracking export ROCP_MCOPY_TRACKING=1 @@ -121,7 +121,6 @@ export ROCP_INPUT=input2.xml eval_test "libtool test, counter sets" ./test/ctrl ## OpenCL test - export ROCP_OBJ_TRACKING=1 export ROCP_INPUT=input1.xml eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution 
From 294cb95794c0222af8c5c6dd01475cabb3d8cbf9 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 13:10:22 -0600 Subject: [PATCH 083/153] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 331a6ffc..f7aacd06 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,11 @@ profiling includes HW performance counters with complex performance metrics. ## To build with the current installed ROCM: ``` + - Python2.7 is required. + The required modules: CppHeaderParser, argparse. + To install: + sudo pip install CppHeaderParser argparse + - To build and install to /opt/rocm/rocprofiler export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm From 5c8dea1a6fdf7085fe5bc04e57fa8dd340156a81 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 13:12:19 -0600 Subject: [PATCH 084/153] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f7aacd06..85a51619 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ profiling includes HW performance counters with complex performance metrics. ## To build with the current installed ROCM: ``` + - ROCm is required. + - Python2.7 is required. The required modules: CppHeaderParser, argparse. To install: From c69d8367c2eaf172591895a4708bfd60d248c850 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 13:12:51 -0600 Subject: [PATCH 085/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 85a51619..0e065952 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,9 @@ profiling includes HW performance counters with complex performance metrics. ## To build with the current installed ROCM: ``` - - ROCm is required. + - ROCm is required. - - Python2.7 is required. + - Python2.7 is required. 
The required modules: CppHeaderParser, argparse. To install: sudo pip install CppHeaderParser argparse From 2c262c3efdd70929757b760d1967a46312ba004c Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 16:35:52 -0600 Subject: [PATCH 086/153] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0e065952..8281f6e1 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,12 @@ Options: --heartbeat - to print progress heartbeats [0 - disabled] --stats - generating kernel executino stats, file .stats.csv + --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing 'HCC_HOME' env var is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing - Generated files: .hsa_stats.txt .json + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible" + Generated files: ._stats.txt .json Traced API list can be set by input .txt or .xml files. Input .txt: hsa: hsa_queue_create hsa_amd_memory_pool_allocate @@ -173,12 +175,13 @@ Options: Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:$HOME: First the configuration file is looking in the current directory, then in your home, and then in the package directory. - Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. 
An example of 'rpl_rc.xml': ``` From 9560d8fc07780b6683a04a13c1d3afa2810dcbd8 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 16:37:02 -0600 Subject: [PATCH 087/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8281f6e1..4a8e949d 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ profiling includes HW performance counters with complex performance metrics. ## Profiling utility usage: ``` - rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] +rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] Options: -h - this help From 8f0e758589dfdc1b01ee91daa525b3eea5fddb4a Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 16:42:35 -0600 Subject: [PATCH 088/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a8e949d..c541d6db 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,6 @@ Configuration file: timestamp=off ctx-limit=0 heartbeat=0 - obj-tracking=0 + obj-tracking=off > ``` From 315a69e2f38e3cbd0fa89d39d5b5e7786aedbbc0 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 17:08:45 -0600 Subject: [PATCH 089/153] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index c541d6db..1aafa0a9 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,10 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. 
+To use the rocProfiler API you need the API header and to link your application with the rocprofiler .so library: + - the API header: /opt/rocm/rocprofiler/include/rocprofiler.h + - the .so library: /opt/rocm/lib/librocprofiler64.so + ## Documentation - ['rocprof' cmdline tool specification](doc/rocprof.md) - ['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) From 3acd29ba630b8ffb3af3b166922bcc3d779a7221 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Sat, 11 Jan 2020 04:47:14 -0600 Subject: [PATCH 090/153] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 57bf2739..a7219cec 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -74,7 +74,6 @@ Library version: ``` ### 3.2. Returning the error and error string methods ``` -rocprofiler_errno_t rocprofiler_errno(); const char* rocprofiler_error_string(); ``` ### 3.3. Library version From 47a3ad58344f7f6efa0b84e9a210cf56b1e83dcc Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Sat, 11 Jan 2020 05:04:33 -0600 Subject: [PATCH 091/153] Update rocprof.md --- doc/rocprof.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/doc/rocprof.md b/doc/rocprof.md index 648648fb..717653eb 100644 --- a/doc/rocprof.md +++ b/doc/rocprof.md @@ -153,7 +153,8 @@ The trace is generated by option ‘—hip-trace’ and includes HIP API timelin #### 2.2.2. ROCr runtime trace The trace is generated by option ‘—hsa-trace’ and includes ROCr API timelines and GPU activity at AQL queue level. Also, can provide counters per kernel. #### 2.2.3. KFD driver trace -Is planned to include Thunk API trace and memory allocations/migration tracing. +The trace is generated by option ‘—kfd-trace’ and includes KFD Thunk API timeline. +It is planned to add memory allocations/migration tracing. #### 2.2.4.
Code annotation Support for application code annotation. Start/stop API is supported to programmatically control the profiling. @@ -230,6 +231,7 @@ Options: --verbose - verbose mode, dumping all base counters used in the input metrics --list-basic - to print the list of basic HW counters --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] -i <.txt|.xml file> - input file Input file .txt format, automatically rerun application for every pmc line: @@ -273,12 +275,16 @@ Options: --ctx-wait - to wait for outstanding contexts on profiler exit [on] --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [off] --stats - generating kernel execution stats, file .stats.csv - --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible - --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible - Generated files: .hsa_stats.txt .json + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible + Generated files: ._stats.txt .json Traced API list can be set by input .txt or .xml files. 
Input .txt: hsa: hsa_queue_create hsa_amd_memory_pool_allocate @@ -288,19 +294,21 @@ Options: + --trace-start - to enable tracing on start [on] --trace-period - to enable trace with initial delay, with periodic sample length and rate Supported time formats: Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: First the configuration file is looking in the current directory, then in your home, and then in the package directory. - Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. An example of 'rpl_rc.xml': ``` ## 6. Publicly available counters and metrics From 1b3ce5e0363dbf704de52f164bd4b8498b8f5b8b Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Sat, 11 Jan 2020 05:10:02 -0600 Subject: [PATCH 092/153] Update README.md --- README.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1aafa0a9..3d9da12e 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,7 @@ Options: --verbose - verbose mode, dumping all base counters used in the input metrics --list-basic - to print the list of basic HW counters --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] -i <.txt|.xml file> - input file Input file .txt format, automatically rerun application for every pmc line: @@ -153,28 +154,28 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [off] - --stats - generating kernel executino stats, file .stats.csv + --stats - generating kernel execution stats, file .stats.csv + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and 
Ranges" JSON trace section. --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible - --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing - 'HCC_HOME' env var is required to be set to where 'hcc' is installed. - --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing - --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible" + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible Generated files: ._stats.txt .json Traced API list can be set by input .txt or .xml files. Input .txt: hsa: hsa_queue_create hsa_amd_memory_pool_allocate Input .xml: - + - - --roctx-trace - to enable rocTX applicatin code annotation trace; should be use in addition to the trace options above. - Will show the application code annotation with rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace - "Markers and Ranges" section. - Application code needs to be explicitely instrumented using rocTX events APIs. - See roctracer documentation on rocTX API details. + + --trace-start - to enable tracing on start [on] + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:$HOME: From 8b402d07f8d6a21397cfa37b98244eec81f40363 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Tue, 14 Jan 2020 11:11:05 -0500 Subject: [PATCH 093/153] Fix crash in the fill_ext_db --- bin/tblextr.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 2c4e442b..10120395 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -239,13 +239,17 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): continue if cid == 2: + if not pid in range_stack: range_stack[pid] = {} pid_stack = range_stack[pid] + if not tid in pid_stack: pid_stack[tid] = [] rec_stack = pid_stack[tid] - rec_vals = rec_stack.pop() - rec_vals[1] = tms + if len(rec_stack) != 0: + rec_vals = rec_stack.pop() + rec_vals[1] = tms - db.insert_entry(table_handle, rec_vals) - record_id += 1 + if len(rec_vals) != 0: + db.insert_entry(table_handle, rec_vals) + record_id += 1 return 1 ############################################################# From 38348e9e170839908e39ad6b1142e4bba1543971 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 16:56:55 -0600 Subject: [PATCH 094/153] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index a7219cec..566ce21f 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -1,4 +1,5 @@ # ROC Profiler Library Specification +API version 7 ## 1. 
High level overview ``` From c771b374326496fe755d6ade4c83f8fe994e8a4e Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 17:09:21 -0600 Subject: [PATCH 095/153] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 566ce21f..efbc727f 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -1,5 +1,5 @@ # ROC Profiler Library Specification -API version 7 +ROC Profiler API version 7 ## 1. High level overview ``` From 2cd889ade33b9ed4308e9e804dcbb10adaa7a002 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 21:58:09 -0600 Subject: [PATCH 096/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3d9da12e..e706920e 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,6 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] - --obj-tracking - to turn on/off kernels code objects tracking [off] --stats - generating kernel execution stats, file .stats.csv @@ -176,6 +175,7 @@ Options: --trace-start - to enable tracing on start [on] --trace-period - to enable trace with initial delay, with periodic sample length and rate Supported time formats: + --obj-tracking - to turn on/off kernels code objects tracking [off] Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:$HOME: From b592051c632dd5f13d338747e3398f448b54c108 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 22:01:47 -0600 Subject: [PATCH 097/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e706920e..ec16a82d 100644 --- a/README.md +++ b/README.md @@ -154,9 +154,10 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [off] --stats - generating kernel execution stats, file .stats.csv - + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible @@ -175,7 +176,6 @@ Options: --trace-start - to enable tracing on start [on] --trace-period - to enable trace with initial delay, with periodic sample length and rate Supported time formats: - --obj-tracking - to turn on/off kernels code objects tracking [off] Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:$HOME: From f840d11c20c48b5c2f14f7200b1e019445f95961 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 22:03:34 -0600 Subject: [PATCH 098/153] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ec16a82d..06aa990f 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] --obj-tracking - to turn on/off kernels code objects tracking [off] + To support V3 code-object. --stats - generating kernel execution stats, file .stats.csv From b575b925acd22933da1edd257526735323a9aab4 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 24 Jan 2020 13:00:20 -0600 Subject: [PATCH 099/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 06aa990f..4c433aaf 100644 --- a/README.md +++ b/README.md @@ -46,8 +46,8 @@ To use the rocProfiler API you need the API header and to link your application ``` - ROCm is required. - - Python2.7 is required. - The required modules: CppHeaderParser, argparse. + - Python is required. + The required modules: sqlite3, CppHeaderParser, argparse. 
To install: sudo pip install CppHeaderParser argparse From 7df4f3a46ee0ea26d87b4aa258ce5f6e9edcaf12 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Mon, 27 Jan 2020 17:48:02 -0500 Subject: [PATCH 100/153] Update tblextr.py --- bin/tblextr.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 10120395..215f79b5 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -239,17 +239,15 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): continue if cid == 2: - if not pid in range_stack: range_stack[pid] = {} + if not pid in range_stack: fatal("ROCTX range begin not found, pid(" + pid + ")"); pid_stack = range_stack[pid] - if not tid in pid_stack: pid_stack[tid] = [] + if not tid in pid_stack: fatal("ROCTX range begin not found, tid(" + tid + ")"); rec_stack = pid_stack[tid] - if len(rec_stack) != 0: - rec_vals = rec_stack.pop() - rec_vals[1] = tms - - if len(rec_vals) != 0: - db.insert_entry(table_handle, rec_vals) - record_id += 1 + rec_vals = rec_stack.pop() + rec_vals[1] = tms + + db.insert_entry(table_handle, rec_vals) + record_id += 1 return 1 ############################################################# From 68113462b8de2bb30a50966aa642801fe498eaa7 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 29 Jan 2020 19:47:54 -0600 Subject: [PATCH 101/153] Update README.md non default python modules --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4c433aaf..4529c326 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ To use the rocProfiler API you need the API header and to link your application - ROCm is required. - Python is required. - The required modules: sqlite3, CppHeaderParser, argparse. + The required modules: CppHeaderParser, argparse. 
To install: sudo pip install CppHeaderParser argparse From 03279c28d0725355bca5881b6d09b7ecca5ee957 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 27 Feb 2020 15:07:49 -0600 Subject: [PATCH 102/153] 3.1 update --- CMakeLists.txt | 16 ++++++++++++++-- bin/rpl_run.sh | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b81b5a6..edc30d1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,9 @@ cmake_minimum_required ( VERSION 2.8.12 ) +# Install prefix +set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix default") + ## Verbose output. set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) @@ -45,13 +48,22 @@ message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" ) set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) -set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" ) set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) endif () set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) +set ( LIB_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( LIB_VERSION_MINOR ${VERSION_MINOR} ) +if ( ${ROCM_PATCH_VERSION} ) + set ( LIB_VERSION_PATCH ${ROCM_PATCH_VERSION} ) +else() + set ( LIB_VERSION_PATCH ${VERSION_PATCH} ) +endif() +set ( LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}" ) +message ( "-- LIB-VERSION STRING: ${LIB_VERSION_STRING}" ) + ## Set target and root/lib/test directory set ( TARGET_NAME "${ROCPROFILER_TARGET}" ) set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) @@ -63,7 +75,7 @@ include ( ${LIB_DIR}/CMakeLists.txt ) ## Set the VERSION and SOVERSION values set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION 
"${LIB_VERSION_STRING}" ) -set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" ) +set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" ) ## If the library is a release, strip the target library if ( "${CMAKE_BUILD_TYPE}" STREQUAL release ) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index ce492e81..d34888cd 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -239,7 +239,7 @@ run() { fi if [ "$KFD_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":kfd" - export LD_PRELOAD="libkfdwrapper64.so libhsakmt.so.1 $LD_PRELOAD" + export LD_PRELOAD="$TT_DIR/lib/libkfdwrapper64.so libhsakmt.so.1 $LD_PRELOAD" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" From c65b74bb0d306565dc399c7b36f514959fac494c Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Wed, 4 Mar 2020 10:29:25 -0500 Subject: [PATCH 103/153] Update sqlitedb.py Add comments section to json with rocminfo and hipcc_version --- bin/sqlitedb.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index cd649e6a..3b494863 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -1,5 +1,6 @@ import csv, sqlite3, re, sys from functools import reduce +from txt2params import gen_params # SQLite Database class class SQLiteDB: @@ -97,12 +98,44 @@ def dump_csv(self, table_name, file_name): for raw in self._get_raws(table_name): fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') + # dump JSON trace def open_json(self, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) + status1, output1 = commands.getstatusoutput("/opt/rocm/bin/rocminfo > rocminfo.txt") + if status1 != 0 : + raise Exception('Could not run command: rocminfo') + params = gen_params('rocminfo.txt'); + + status2, output2 = commands.getstatusoutput("/opt/rocm/bin/hipcc --version > hipccversion.txt") + if 
status2 != 0 : + raise Exception('Could not run command: hipcc --version') + params2 = gen_params('hipccversion.txt'); + with open(file_name, mode='w') as fd: - fd.write('{ "traceEvents":[{}\n'); + cnt = 0 + fd.write('{\n') + fd.write('"comments": {\n') + fd.write(' "rocminfo": {\n') + for key in params: + cnt = cnt + 1 + if cnt == len(params): + fd.write(' "' + key + '": "' + params[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params[key] + '",\n') + fd.write(' },\n') + cnt = 0 + fd.write(' "hipcc_version": {\n') + for key in params2: + cnt = cnt + 1 + if cnt == len(params2): + fd.write(' "' + key + '": "' + params2[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params2[key] + '",\n') + fd.write(' }\n') + fd.write('},\n') + fd.write('"traceEvents":[{}\n'); def close_json(self, file_name): if not re.search(r'\.json$', file_name): From dc9bee75d645e23e648bd7d2bfdba8b2320c1b0f Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Wed, 4 Mar 2020 10:36:28 -0500 Subject: [PATCH 104/153] Create txt2params.py --- bin/txt2params.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 bin/txt2params.py diff --git a/bin/txt2params.py b/bin/txt2params.py new file mode 100644 index 00000000..ce5a2a8c --- /dev/null +++ b/bin/txt2params.py @@ -0,0 +1,88 @@ +#!/usr/bin/python + +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + +import os, sys, re + +def gen_params(txtfile): + fields = {} + parent_field = '' + nbr_indent = 0 + nbr_indent_prev = 0 + check_for_dims = False + with open(txtfile) as fp: + for line in fp: + mv = re.match(r'HCC clang version\s+(.*)',line) + if mv: + key = 'HCCclangversion' + val = mv.group(1) + fields[key] = val + continue + if check_for_dims == True: + mc = re.match(r'\s*([x|y|z])\s+(.*)',line) + if mc: + key_sav = mc.group(1) + if parent_field != '': + key = parent_field + '_' + mc.group(1) + else: + key = mc.group(1) + val = re.sub(r"\s+", "", mc.group(2)) + fields[key] = val + if key_sav == 'z': + check_for_dims = False + nbr_indent_prev = nbr_indent + mi = re.search(r'^(\s+)\w+', line) + md = re.search(':', line) + if mi: + nbr_indent = len(mi.group(1)) / 2 #indentation cnt + else: + if not md: + tmp = re.sub(r"\s+", "", line) + if tmp.isalnum(): + parent_field = tmp + continue + + if nbr_indent < nbr_indent_prev: + pos = parent_field.rfind('_') + if pos != -1: + parent_field = parent_field[:pos] # remove last _* + + for lin in line.split(';'): + lin = re.sub(r"\s+", "", lin) + m = re.match(r'(.*):(.*)', lin) + if m: + key, val = m.group(1), m.group(2) + if parent_field != '': + key = parent_field + '_' + key + if val == '': + mk = re.match(r'.*Dimension',key) + if mk: # expect x,y,z on next 3 lines + check_for_dims = True + parent_field = key + else: + fields[key] = val + else: + if nbr_indent != nbr_indent_prev and not check_for_dims : + parent_field = parent_field + '_' + lin.replace(':','') + + return fields From 3b9438c443fb28a2e4a8d70bc853db70317a82b0 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Fri, 6 Mar 2020 10:29:04 -0500 Subject: [PATCH 105/153] Update sqlitedb.py --- bin/sqlitedb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 3b494863..472efcc7 100644 --- 
a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -116,7 +116,7 @@ def open_json(self, file_name): with open(file_name, mode='w') as fd: cnt = 0 fd.write('{\n') - fd.write('"comments": {\n') + fd.write('"otherData": {\n') fd.write(' "rocminfo": {\n') for key in params: cnt = cnt + 1 From 65ece940f97173efeaa80bb6d579a2c316a93b46 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Fri, 13 Mar 2020 18:08:47 -0400 Subject: [PATCH 106/153] Update tblextr.py --- bin/tblextr.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 2c4e442b..fd064758 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -25,6 +25,7 @@ import os, sys, re from sqlitedb import SQLiteDB import dform +from txt2params import gen_params # Parsing results in the format: #dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): @@ -59,6 +60,48 @@ var_table = {} ############################################################# +def json_metadata_gen(sysinfo_file, index): + if not re.search(r'\.txt$', sysinfo_file): + raise Exception('wrong output file type: "' + sysinfo_file + '"' ) + if index == 1: + status, output = commands.getstatusoutput("/opt/rocm/bin/rocminfo > " + sysinfo_file) + if status != 0 : + raise Exception('Could not run command: rocminfo') + params = gen_params(sysinfo_file); + elif index == 2: + status, output = commands.getstatusoutput("/opt/rocm/bin/hipcc --version >" + sysinfo_file) + if status != 0 : + raise Exception('Could not run command: hipcc --version') + params = gen_params(sysinfo_file); + return params + +def json_metadata_write(jsonfile, params, params2): + with open(jsonfile, mode='a') as fd: + cnt = 0 + fd.write('],\n') + fd.write('"otherData": {\n') + fd.write(' "rocminfo": {\n') + for key in params: + cnt = cnt + 1 + if cnt == len(params): + fd.write(' "' + 
key + '": "' + params[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params[key] + '",\n') + if len(params2) == 0: + fd.write(' }\n') + return + fd.write(' },\n') + cnt = 0 + fd.write(' "hipcc_version": {\n') + for key in params2: + cnt = cnt + 1 + if cnt == len(params2): + fd.write(' "' + key + '": "' + params2[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params2[key] + '",\n') + fd.write(' }\n') + fd.write('}\n') + def fatal(msg): sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); sys.exit(1) @@ -239,11 +282,13 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): continue if cid == 2: + if not pid in range_stack: fatal("ROCTX range begin not found, pid(" + pid + ")"); pid_stack = range_stack[pid] + if not tid in pid_stack: fatal("ROCTX range begin not found, tid(" + tid + ")"); rec_stack = pid_stack[tid] rec_vals = rec_stack.pop() rec_vals[1] = tms - + db.insert_entry(table_handle, rec_vals) record_id += 1 @@ -449,6 +494,8 @@ def fill_ops_db(table_name, db, indir): hsa_statfile = re.sub(r'\.stats\.csv$', r'.hsa_stats.csv', statfile) hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile) kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) + sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo_stats.txt', statfile) + params = json_metadata_gen(sysinfo_file, 1) with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) @@ -514,6 +561,9 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) + sysinfo_file2 = re.sub(r'\.stats\.csv$', r'.sysinfo_stats2.txt', statfile) + params2 = json_metadata_gen(sysinfo_file2, 2) + if kfd_trace_found: dform.post_process_data(db, 'KFD') dform.gen_table_bins(db, 'KFD', kfd_statfile, 'Name', 'DurationNs') @@ -544,6 +594,7 @@ def fill_ops_db(table_name, db, indir): dep_id += len(tid_list) if any_trace_found: + json_metadata_write(jsonfile, params, params2) 
db.close_json(jsonfile); db.close() From e5048f68d45365a17dbdad85447477b4aba349b8 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Fri, 13 Mar 2020 18:11:30 -0400 Subject: [PATCH 107/153] Update sqlitedb.py --- bin/sqlitedb.py | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 472efcc7..484b6488 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -103,45 +103,14 @@ def dump_csv(self, table_name, file_name): def open_json(self, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) - status1, output1 = commands.getstatusoutput("/opt/rocm/bin/rocminfo > rocminfo.txt") - if status1 != 0 : - raise Exception('Could not run command: rocminfo') - params = gen_params('rocminfo.txt'); - - status2, output2 = commands.getstatusoutput("/opt/rocm/bin/hipcc --version > hipccversion.txt") - if status2 != 0 : - raise Exception('Could not run command: hipcc --version') - params2 = gen_params('hipccversion.txt'); - with open(file_name, mode='w') as fd: - cnt = 0 - fd.write('{\n') - fd.write('"otherData": {\n') - fd.write(' "rocminfo": {\n') - for key in params: - cnt = cnt + 1 - if cnt == len(params): - fd.write(' "' + key + '": "' + params[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params[key] + '",\n') - fd.write(' },\n') - cnt = 0 - fd.write(' "hipcc_version": {\n') - for key in params2: - cnt = cnt + 1 - if cnt == len(params2): - fd.write(' "' + key + '": "' + params2[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params2[key] + '",\n') - fd.write(' }\n') - fd.write('},\n') - fd.write('"traceEvents":[{}\n'); + fd.write('{ "traceEvents":[{}\n'); def close_json(self, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: - fd.write(']}\n'); + fd.write('}') def 
label_json(self, pid, label, file_name): if not re.search(r'\.json$', file_name): @@ -264,3 +233,4 @@ def add_csv_table(self, table_name, file_name, extra = ()): self.insert_table(table, reader) ############################################################################################## + From 30ea6b7bd54bfb087a8daaa9d465750e9c758535 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 18 Mar 2020 17:27:47 -0500 Subject: [PATCH 108/153] rocprofiler spec: correcting info_data_t --- doc/rocprofiler_spec.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index efbc727f..c276a1f9 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -221,7 +221,11 @@ typedef struct { union { struct { const char* name; // metric name + uint32_t instances; // instances number + const char* expr; // metric expression, NULL for basic counters const char* description; // metric description + const char* block_name; // block name + uint32_t block_counters; // number of block counters } metric; struct { const char* name; // trace name From c355f87c5ab2f07a67a6d736f5617ed04050544a Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 18 Mar 2020 17:37:45 -0500 Subject: [PATCH 109/153] rocprofiler spec: correcting get info code example --- doc/rocprofiler_spec.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index c276a1f9..25e61df7 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -595,11 +595,11 @@ Info data callback: switch (info.kind) { case ROCPROFILER_INFO_KIND_METRIC: { if (info.metric.expr != NULL) { - fprintf(stdout, "Basic counter: gpu-agent%d : %s : %s\n", + fprintf(stdout, "Derived counter: gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); } else { - fprintf(stdout, "Derived counter: gpu-agent%d : %s", + fprintf(stdout, 
"Basic counter: gpu-agent%d : %s", info.agent_index, info.metric.name); if (info.metric.instances > 1) { fprintf(stdout, "[0-%u]", info.metric.instances - 1); From 11c83187320d0f5db43f82b872beb473128d58a3 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 19 Mar 2020 00:39:38 -0500 Subject: [PATCH 110/153] kernel build fix --- bin/build_kernel.sh | 39 +++++++++++++++++++++++++++++---------- test/ctrl/test_hsa.cpp | 9 +-------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh index 6c4afe6f..e89cf561 100755 --- a/bin/build_kernel.sh +++ b/bin/build_kernel.sh @@ -1,7 +1,9 @@ -#!/bin/sh +#!/bin/sh -x TEST_NAME=$1 DST_DIR=$2 +ROCM_DIR=$3 +TGT_LIST=$4 if [ -z "$TEST_NAME" ] ; then echo "Usage: $0 " @@ -13,18 +15,35 @@ if [ -z "$DST_DIR" ] ; then DST_DIR=$(dirname TEST_NAME) fi -GFXIP=$(/opt/rocm/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p") -if [ -z "$GFXIP" ] ; then - echo "GPU is not found" - exit 1 +if [ -z "$ROCM_DIR" ] ; then + ROCM_DIR=/opt/rocm fi -OBJ_PREF=$(echo $GFXIP | head -c 4) -OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') -OBJ_FILE=${OBJ_PREF}_${OBJ_NAME}.hsaco +if [ -z "$TGT_LIST" ] ; then + TGT_LIST=$(/opt/rocm/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p") +fi -/opt/rocm/opencl/bin/x86_64/clang -cl-std=CL2.0 -cl-std=CL2.0 -include /opt/rocm/opencl/include/opencl-c.h -Xclang -mlink-bitcode-file -Xclang /opt/rocm/opencl/lib/x86_64/bitcode/opencl.amdgcn.bc -Xclang -mlink-bitcode-file -Xclang /opt/rocm/opencl/lib/x86_64/bitcode/ockl.amdgcn.bc -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $OBJ_FILE +if [ -z "$TGT_LIST" ] ; then + echo "Error: GPU targets not found" + exit 1 +fi -echo "'$OBJ_FILE' is generated for '$GFXIP'" +OCL_VER="2.0" +OCL_DIR=$ROCM_DIR/opencl + +LLVM_DIR=$ROCM_DIR/hcc +CLANG=$LLVM_DIR/bin/clang 
+BITCODE_OPTS="\ + -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/opencl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/ockl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/ocml.amdgcn.bc" + +for GFXIP in $TGT_LIST ; do + OBJ_PREF=$GFXIP + OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') + OBJ_FILE=${OBJ_PREF}_${OBJ_NAME}.hsaco + $CLANG -cl-std=CL$OCL_VER -include $OCL_DIR/include/opencl-c.h $BITCODE_OPTS -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $DST_DIR/$OBJ_FILE + echo "'$OBJ_FILE' is generated for '$GFXIP'" +done exit 0 diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp index 3cb5dee7..47f788cf 100644 --- a/test/ctrl/test_hsa.cpp +++ b/test/ctrl/test_hsa.cpp @@ -82,14 +82,7 @@ bool TestHsa::Initialize(int /*arg_cnt*/, char** /*arg_list*/) { // Obtain the code object file name std::string agentName(agent_info_->name); - if (agentName.compare(0, 4, "gfx8") == 0) { - brig_path_obj_.append("gfx8"); - } else if (agentName.compare(0, 4, "gfx9") == 0) { - brig_path_obj_.append("gfx9"); - } else { - TEST_ASSERT(false); - return false; - } + brig_path_obj_.append(agentName); brig_path_obj_.append("_" + name_ + ".hsaco"); return true; From 741b8707a234674aef409f18c505c9d4cb68045a Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Thu, 19 Mar 2020 17:19:12 -0400 Subject: [PATCH 111/153] Update sqlitedb.py --- bin/sqlitedb.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 484b6488..9fa1823c 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -97,7 +97,6 @@ def dump_csv(self, table_name, file_name): fd.write(','.join(fields) + '\n') for raw in self._get_raws(table_name): fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') - # dump JSON trace def open_json(self, file_name): @@ -232,5 +231,19 @@ def add_csv_table(self, table_name, 
file_name, extra = ()): table = self.add_table(table_name, descr, extra) self.insert_table(table, reader) + def metadata_json(self, jsonfile, sysinfo_file): + params = gen_params(sysinfo_file); + with open(jsonfile, mode='a') as fd: + cnt = 0 + fd.write('],\n') + fd.write('"otherData": {\n') + for key in params: + cnt = cnt + 1 + if cnt == len(params): + fd.write(' "' + key + '": "' + params[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params[key] + '",\n') + fd.write(' }\n') + ############################################################################################## From 19d26f394aae9faa86ce7462bf24ae09fdccd2c9 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Thu, 19 Mar 2020 17:22:39 -0400 Subject: [PATCH 112/153] Update tblextr.py --- bin/tblextr.py | 64 ++++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 47 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index fd064758..35aebee8 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -25,7 +25,6 @@ import os, sys, re from sqlitedb import SQLiteDB import dform -from txt2params import gen_params # Parsing results in the format: #dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): @@ -60,47 +59,16 @@ var_table = {} ############################################################# -def json_metadata_gen(sysinfo_file, index): - if not re.search(r'\.txt$', sysinfo_file): - raise Exception('wrong output file type: "' + sysinfo_file + '"' ) - if index == 1: - status, output = commands.getstatusoutput("/opt/rocm/bin/rocminfo > " + sysinfo_file) - if status != 0 : - raise Exception('Could not run command: rocminfo') - params = gen_params(sysinfo_file); - elif index == 2: - status, output = commands.getstatusoutput("/opt/rocm/bin/hipcc --version >" + sysinfo_file) - if status != 0 : - raise Exception('Could not run command: hipcc --version') - 
params = gen_params(sysinfo_file); - return params - -def json_metadata_write(jsonfile, params, params2): - with open(jsonfile, mode='a') as fd: - cnt = 0 - fd.write('],\n') - fd.write('"otherData": {\n') - fd.write(' "rocminfo": {\n') - for key in params: - cnt = cnt + 1 - if cnt == len(params): - fd.write(' "' + key + '": "' + params[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params[key] + '",\n') - if len(params2) == 0: - fd.write(' }\n') - return - fd.write(' },\n') - cnt = 0 - fd.write(' "hipcc_version": {\n') - for key in params2: - cnt = cnt + 1 - if cnt == len(params2): - fd.write(' "' + key + '": "' + params2[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params2[key] + '",\n') - fd.write(' }\n') - fd.write('}\n') +def metadata_gen(sysinfo_file, sysinfo_cmd): + if not re.search(r'\.txt$', sysinfo_file): + raise Exception('wrong output file type: "' + sysinfo_file + '"' ) + if re.search(r'rocminfo', sysinfo_cmd): + direct_str = " > " + else: + direct_str = " >> " + status, output = commands.getstatusoutput(sysinfo_cmd + direct_str + sysinfo_file) + if status != 0 : + raise Exception('Could not run command: ' + sysinfo_cmd) def fatal(msg): sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); @@ -288,7 +256,7 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): rec_stack = pid_stack[tid] rec_vals = rec_stack.pop() rec_vals[1] = tms - + db.insert_entry(table_handle, rec_vals) record_id += 1 @@ -495,7 +463,7 @@ def fill_ops_db(table_name, db, indir): hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile) kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo_stats.txt', statfile) - params = json_metadata_gen(sysinfo_file, 1) + metadata_gen(sysinfo_file, '/opt/rocm/bin/rocminfo') with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) @@ -561,8 +529,10 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'OPS') 
dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) - sysinfo_file2 = re.sub(r'\.stats\.csv$', r'.sysinfo_stats2.txt', statfile) - params2 = json_metadata_gen(sysinfo_file2, 2) + #sysinfo_file2 = re.sub(r'\.stats\.csv$', r'.sysinfo_stats2.txt', statfile) + #params2 = metadata_gen(sysinfo_file2, 2) + #params2 = + metadata_gen(sysinfo_file, '/opt/rocm/bin/hipcc --version') if kfd_trace_found: dform.post_process_data(db, 'KFD') @@ -594,7 +564,7 @@ def fill_ops_db(table_name, db, indir): dep_id += len(tid_list) if any_trace_found: - json_metadata_write(jsonfile, params, params2) + db.metadata_json(jsonfile, sysinfo_file) db.close_json(jsonfile); db.close() From d4e182cafd0aeae9388c7b28bfa0fb21a8619d7d Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Thu, 19 Mar 2020 17:25:06 -0400 Subject: [PATCH 113/153] Update txt2params.py --- bin/txt2params.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/bin/txt2params.py b/bin/txt2params.py index ce5a2a8c..358acc3a 100644 --- a/bin/txt2params.py +++ b/bin/txt2params.py @@ -24,6 +24,11 @@ import os, sys, re +# gen_params() takes a text file like the output of rocminfo cmd and parses it into a map {key,value} +# where key is the param and value is the value of this param +# for example: Threadmodel : "posix" +# it also processes encompasing sections to generate a full param name such as (section names separated by '_'): +# "Agent2_PoolInfo_ISAInfo_ISA1_WorkgroupMaxSizeperDimension_x": "1024(0x400)", def gen_params(txtfile): fields = {} parent_field = '' @@ -32,12 +37,24 @@ def gen_params(txtfile): check_for_dims = False with open(txtfile) as fp: for line in fp: - mv = re.match(r'HCC clang version\s+(.*)',line) + me = re.match(r'\*\*\* Done \*\*\*',line) #Marks the end of cmd + if me: + parent_field = '' + nbr_indent = 0 + nbr_indent_prev = 0 + check_for_dims = False + continue + mv = re.match(r'HCC clang 
version\s+(.*)',line) # outlier: only line with a version number and no ':', special case if mv: key = 'HCCclangversion' val = mv.group(1) fields[key] = val continue + # Variable 'check_for_dims' is True for text like this: + # Workgroup Max Size per Dimension: + # x 1024(0x400) + # y 1024(0x400) + # z 1024(0x400) if check_for_dims == True: mc = re.match(r'\s*([x|y|z])\s+(.*)',line) if mc: @@ -62,11 +79,14 @@ def gen_params(txtfile): parent_field = tmp continue - if nbr_indent < nbr_indent_prev: + if nbr_indent < nbr_indent_prev: pos = parent_field.rfind('_') if pos != -1: - parent_field = parent_field[:pos] # remove last _* + parent_field = parent_field[:pos] + # Process lines such as : + # Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED + # Size: 131897644(0x7dc992c) KB for lin in line.split(';'): lin = re.sub(r"\s+", "", lin) m = re.match(r'(.*):(.*)', lin) @@ -86,3 +106,4 @@ def gen_params(txtfile): parent_field = parent_field + '_' + lin.replace(':','') return fields + From 8f856d9fa96c1f2ac90e36fea59d0e819a650061 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 20 Mar 2020 14:52:33 -0500 Subject: [PATCH 114/153] cleanup --- bin/tblextr.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index d3c470e1..3f47c65d 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -85,7 +85,7 @@ def dbglog(msg): # parse results method def parse_res(infile): global max_gpu_id - if not os.path.isfile(infile): return # fatal("Error: input file '" + infile + "' not found") + if not os.path.isfile(infile): return inp = open(infile, 'r') beg_pattern = re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") @@ -529,9 +529,6 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) - #sysinfo_file2 = re.sub(r'\.stats\.csv$', r'.sysinfo_stats2.txt', statfile) - #params2 = metadata_gen(sysinfo_file2, 2) - #params2 = 
metadata_gen(sysinfo_file, '/opt/rocm/bin/hipcc --version') if kfd_trace_found: From 5fb023db4f547f183bd000dce307aad767f07e27 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 24 Mar 2020 19:48:12 -0500 Subject: [PATCH 115/153] adding labels sort index --- bin/sqlitedb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 9fa1823c..805c954c 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -115,7 +115,7 @@ def label_json(self, pid, label, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: - fd.write(',{"args":{"name":"%s %s"},"ph":"M","pid":%s,"name":"process_name"}\n' %(self.section_index, label, pid)); + fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)); self.section_index += 1 def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): From dd824f652a149ce31ea65300de4967d94f6556f8 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 30 Apr 2020 10:02:30 -0500 Subject: [PATCH 116/153] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4529c326..1e5df698 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,9 @@ To use the rocProfiler API you need the API header and to link your application - ROCm is required. - Python is required. - The required modules: CppHeaderParser, argparse. 
+ The required modules: CppHeaderParser, argparse, sqlite3 To install: - sudo pip install CppHeaderParser argparse + sudo pip install CppHeaderParser argparse sqlite3 - To build and install to /opt/rocm/rocprofiler export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm From a0a4bf10fd620271ae9be47be3f961a1085ae503 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 30 Apr 2020 17:04:28 -0500 Subject: [PATCH 117/153] intercept test removing not needed headers --- bin/build_kernel.sh | 40 +++++++++++++++++++++++++++---------- test/app/intercept_test.cpp | 22 ++++++++------------ 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh index e89cf561..9412a68e 100755 --- a/bin/build_kernel.sh +++ b/bin/build_kernel.sh @@ -1,4 +1,5 @@ #!/bin/sh -x +SO_EXT="hsaco" TEST_NAME=$1 DST_DIR=$2 @@ -7,9 +8,10 @@ TGT_LIST=$4 if [ -z "$TEST_NAME" ] ; then echo "Usage: $0 " - echo " Will look for .cl and will build .so dynamic object library" + echo " Will look for .cl and will build .$SO_EXT dynamic code object library" exit 1 fi +OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') if [ -z "$DST_DIR" ] ; then DST_DIR=$(dirname TEST_NAME) @@ -29,21 +31,37 @@ if [ -z "$TGT_LIST" ] ; then fi OCL_VER="2.0" -OCL_DIR=$ROCM_DIR/opencl -LLVM_DIR=$ROCM_DIR/hcc -CLANG=$LLVM_DIR/bin/clang +if [ -e $ROCM_DIR/llvm ] ; then + LLVM_DIR=$ROCM_DIR/llvm + LIB_DIR=$ROCM_DIR/lib +else + LLVM_DIR=$ROCM_DIR/hcc + LIB_DIR=$LLVM_DIR/lib +fi + +BC_DIR=$LIB_DIR/bitcode +if [ ! 
-d "$BC_DIR" ] ; then BC_DIR=$LIB_DIR; fi + +CLANG_ROOT=$LLVM_DIR/lib/clang +CLANG_DIR=`ls -d $CLANG_ROOT/* | head -n 1` +if [ "$CLANG_DIR" = "" ] ; then + echo "Error: LLVM clang library was not found" + exit 1 +fi + +BIN_DIR=$LLVM_DIR/bin +INC_DIR=$CLANG_DIR/include BITCODE_OPTS="\ - -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/opencl.amdgcn.bc \ - -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/ockl.amdgcn.bc \ - -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/ocml.amdgcn.bc" + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/opencl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ockl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ocml.amdgcn.bc" for GFXIP in $TGT_LIST ; do OBJ_PREF=$GFXIP - OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') - OBJ_FILE=${OBJ_PREF}_${OBJ_NAME}.hsaco - $CLANG -cl-std=CL$OCL_VER -include $OCL_DIR/include/opencl-c.h $BITCODE_OPTS -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $DST_DIR/$OBJ_FILE - echo "'$OBJ_FILE' is generated for '$GFXIP'" + OBJ_FILE="${OBJ_PREF}_${OBJ_NAME}.$SO_EXT" + $BIN_DIR/clang -cl-std=CL$OCL_VER -include $INC_DIR/opencl-c.h $BITCODE_OPTS -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $DST_DIR/$OBJ_FILE + echo "'$OBJ_FILE' generated" done exit 0 diff --git a/test/app/intercept_test.cpp b/test/app/intercept_test.cpp index 876b3102..c2905d1e 100644 --- a/test/app/intercept_test.cpp +++ b/test/app/intercept_test.cpp @@ -30,14 +30,8 @@ THE SOFTWARE. 
#include #include -#include "ctrl/run_kernel.h" -#include "ctrl/test_aql.h" -#include "ctrl/test_hsa.h" #include "inc/rocprofiler.h" -#include "dummy_kernel/dummy_kernel.h" -#include "simple_convolution/simple_convolution.h" -#include "util/test_assert.h" -#include "util/xml.h" +#include "util/hsa_rsrc_factory.h" #define PUBLIC_API __attribute__((visibility("default"))) #define CONSTRUCTOR_API __attribute__((constructor)) @@ -228,7 +222,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, unsigned metrics_input(rocprofiler_feature_t** ret) { // Profiling feature objects - const unsigned feature_count = 9; + const unsigned feature_count = 6; rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); @@ -245,12 +239,12 @@ unsigned metrics_input(rocprofiler_feature_t** ret) { features[4].name = "SQ_INSTS_VALU"; features[5].kind = ROCPROFILER_FEATURE_KIND_METRIC; features[5].name = "VALUInsts"; - features[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[6].name = "TCC_HIT_sum"; - features[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[7].name = "TCC_MISS_sum"; - features[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[8].name = "WRITE_SIZE"; +// features[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// features[6].name = "TCC_HIT_sum"; +// features[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// features[7].name = "TCC_MISS_sum"; +// features[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// features[8].name = "WRITE_SIZE"; *ret = features; return feature_count; From fdd7d536ad1451fd51d4b699be5018e548581be0 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Fri, 5 Jun 2020 14:53:05 -0400 Subject: [PATCH 118/153] Adding reference instead of copy of container elements --- src/core/context.h | 2 +- src/core/metrics.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/context.h b/src/core/context.h index 
856c7024..f3ab1294 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -257,7 +257,7 @@ class Context { void GetMetricsData() const { const MetricArgs args(info_map_); - for (const auto v : metrics_map_) { + for (const auto &v : metrics_map_) { const std::string& name = v.first; const Metric* metric = v.second; const xml::Expr* expr = metric->GetExpr(); diff --git a/src/core/metrics.h b/src/core/metrics.h index 57ec7c31..f9ae1fbd 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -277,7 +277,7 @@ class MetricsDict { std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; #endif counters_vec_t counters_vec; - for (const std::string var : expr_obj->GetVars()) { + for (const auto& var : expr_obj->GetVars()) { auto it = cache_.find(var); if (it == cache_.end()) { EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var << "' is not found"); From 2ef4d5d58b97c34c30dd5b919cd32825e1482677 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 12 Jun 2020 10:36:15 -0500 Subject: [PATCH 119/153] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1e5df698..44589d6e 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,8 @@ To use the rocProfiler API you need the API header and to link your application sudo pip install CppHeaderParser argparse sqlite3 - To build and install to /opt/rocm/rocprofiler + Please use release branches/tags of 'amd-master' branch for development version. 
+ export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm cd .../rocprofiler From 1debae51dcd72244791eb26853c5b808cf6830c2 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 26 Jun 2020 23:32:22 -0500 Subject: [PATCH 120/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 44589d6e..73e67716 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ To use the rocProfiler API you need the API header and to link your application ## To build with the current installed ROCM: ``` - - ROCm is required. + - Python is required. The required modules: CppHeaderParser, argparse, sqlite3 From 13fa7df93562798b69d1de0efd35a3e90f29f4c9 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 26 Jun 2020 23:38:29 -0500 Subject: [PATCH 121/153] Revert "Update README.md" This reverts commit 1debae51dcd72244791eb26853c5b808cf6830c2. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 73e67716..44589d6e 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ To use the rocProfiler API you need the API header and to link your application ## To build with the current installed ROCM: ``` - + - ROCm is required. - Python is required. The required modules: CppHeaderParser, argparse, sqlite3 From 4aa416fc83baecae7ce02c60ba17353c1df669c4 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 26 Jun 2020 23:44:11 -0500 Subject: [PATCH 122/153] adding ROCm requirements --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 44589d6e..9108409f 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ To use the rocProfiler API you need the API header and to link your application ## To build with the current installed ROCM: ``` - ROCm is required. + ROCr-runtime and roctracer are needed - Python is required. 
The required modules: CppHeaderParser, argparse, sqlite3 From c988f7f327ffb7dff09e7faf56f38d138e28d14f Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 16 Jul 2020 02:58:45 -0500 Subject: [PATCH 123/153] 3.6 update --- CMakeLists.txt | 76 ++++-- bin/build_kernel.sh | 2 +- bin/dform.py | 23 +- bin/rpl_run.sh | 116 +++++---- bin/run_tool.sh | 38 --- bin/sqlitedb.py | 67 ++++-- bin/tblextr.py | 194 ++++++++++----- bin/txt2params.py | 47 ++-- cmake_modules/env.cmake | 32 ++- inc/rocprofiler.h | 90 ++++++- src/CMakeLists.txt | 3 +- src/core/activity.cpp | 171 +++++++++++++ src/core/context.h | 25 +- src/core/hsa_interceptor.h | 385 ++++++++++++++++++++++++++++++ src/core/hsa_proxy_queue.h | 1 + src/core/intercept_queue.cpp | 8 +- src/core/intercept_queue.h | 135 +++++++---- src/core/metrics.h | 2 +- src/core/proxy_queue.h | 2 +- src/core/rocprofiler.cpp | 111 ++++++++- src/core/tracker.h | 16 +- src/util/hsa_rsrc_factory.cpp | 65 +++-- src/util/hsa_rsrc_factory.h | 39 ++- test/CMakeLists.txt | 51 ++-- test/app/intercept_test.cpp | 2 +- test/app/stand_intercept_test.cpp | 190 +++++++++++++++ test/app/standalone_test.cpp | 56 ++++- test/run.sh | 64 ++++- test/tool/tool.cpp | 103 +++++++- test/util/hsa_rsrc_factory.cpp | 32 ++- test/util/hsa_rsrc_factory.h | 105 +++++++- 31 files changed, 1879 insertions(+), 372 deletions(-) delete mode 100755 bin/run_tool.sh create mode 100644 src/core/activity.cpp create mode 100644 src/core/hsa_interceptor.h create mode 100644 test/app/stand_intercept_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index edc30d1a..8aac5175 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,31 @@ set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) set ( LIB_DIR "${ROOT_DIR}/src" ) set ( TEST_DIR "${ROOT_DIR}/test" ) +## Enable tracing API +if (NOT USE_PROF_API) + set(USE_PROF_API 1) +endif() + +# Protocol header lookup +set(PROF_API_HEADER_NAME prof_protocol.h) +if(USE_PROF_API EQUAL 1) + find_path(PROF_API_HEADER_DIR ${PROF_API_HEADER_NAME} 
+ HINTS + ${PROF_API_HEADER_PATH} + PATHS + /opt/rocm/roctracer + PATH_SUFFIXES + include/ext + ) + if(NOT PROF_API_HEADER_DIR) + MESSAGE(FATAL_ERROR "Profiling API header not found. Tracer integration disabled. Use -DPROF_API_HEADER_PATH=") + else() + add_definitions(-DUSE_PROF_API=1) + include_directories(${PROF_API_HEADER_DIR}) + MESSAGE(STATUS "Profiling API: ${PROF_API_HEADER_DIR}/${PROF_API_HEADER_NAME}") + endif() +endif() + ## Build library include ( ${LIB_DIR}/CMakeLists.txt ) @@ -85,41 +110,58 @@ endif () ## Build tests add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) +## Installation and packaging +set ( DEST_NAME ${ROCPROFILER_NAME} ) +if ( DEFINED CPACK_PACKAGING_INSTALL_PREFIX ) + get_filename_component ( pkg_name ${CPACK_PACKAGING_INSTALL_PREFIX} NAME ) + get_filename_component ( pkg_dir ${CPACK_PACKAGING_INSTALL_PREFIX} DIRECTORY ) + if ( pkg_name STREQUAL ${DEST_NAME} ) + set ( CPACK_PACKAGING_INSTALL_PREFIX ${pkg_dir} ) + endif () +else () + set ( CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} ) +endif () +message ( "CMake-install-prefix: ${CMAKE_INSTALL_PREFIX}" ) +message ( "CPack-install-prefix: ${CPACK_PACKAGING_INSTALL_PREFIX}" ) +message ( "-----------Dest-name: ${DEST_NAME}" ) + ## Create symlinks for packaging and install add_custom_target ( rocprof-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/bin/rpl_run.sh rocprof-link ) -add_custom_target ( inc-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/include inc-link ) + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${DEST_NAME}/bin/rpl_run.sh rocprof-link ) +#add_custom_target ( inc-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} +# COMMAND ${CMAKE_COMMAND} -E create_symlink ../${DEST_NAME}/include inc-link ) add_custom_target ( so-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E create_symlink 
../${ROCPROFILER_NAME}/lib/${ROCPROFILER_LIBRARY}.so so-link ) + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${DEST_NAME}/lib/${ROCPROFILER_LIBRARY}.so so-link ) -set ( CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/${ROCPROFILER_NAME}" ) -message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) -install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION lib ) -install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION include ) +# Install header and library +install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${DEST_NAME}/lib ) +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${DEST_NAME}/include ) +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION include/${DEST_NAME} ) # rpl_run.sh tblextr.py txt2xml.sh install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh + ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2params.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/dform.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/sqlitedb.py - DESTINATION bin + DESTINATION ${DEST_NAME}/bin PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) -install ( FILES ${PROJECT_BINARY_DIR}/inc-link DESTINATION ../include RENAME ${ROCPROFILER_NAME} ) -install ( FILES ${PROJECT_BINARY_DIR}/so-link DESTINATION ../lib RENAME ${ROCPROFILER_LIBRARY}.so ) -install ( FILES ${PROJECT_BINARY_DIR}/rocprof-link DESTINATION ../bin - PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE - RENAME rocprof ) # gfx_metrics.xml metrics.xml install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/gfx_metrics.xml - DESTINATION lib ) + DESTINATION ${DEST_NAME}/lib ) # libtool.so -install ( FILES ${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION tool ) -install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION tool +install ( FILES 
${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION ${DEST_NAME}/tool ) +install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION ${DEST_NAME}/tool PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) +# links +install ( FILES ${PROJECT_BINARY_DIR}/so-link DESTINATION lib RENAME ${ROCPROFILER_LIBRARY}.so ) +#install ( FILES ${PROJECT_BINARY_DIR}/inc-link DESTINATION include RENAME ${DEST_NAME} ) +install ( FILES ${PROJECT_BINARY_DIR}/rocprof-link DESTINATION bin + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE + RENAME rocprof ) ## Packaging directives set ( CPACK_GENERATOR "DEB" "RPM" "TGZ" ) diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh index 9412a68e..8ed0f168 100755 --- a/bin/build_kernel.sh +++ b/bin/build_kernel.sh @@ -22,7 +22,7 @@ if [ -z "$ROCM_DIR" ] ; then fi if [ -z "$TGT_LIST" ] ; then - TGT_LIST=$(/opt/rocm/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p") + TGT_LIST=`$ROCM_DIR/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p"` fi if [ -z "$TGT_LIST" ] ; then diff --git a/bin/dform.py b/bin/dform.py index 93194608..82a81d08 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -1,4 +1,25 @@ -#!/usr/bin/python +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + from sqlitedb import SQLiteDB def gen_message(outfile): diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index d34888cd..0c3d83d4 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -35,13 +35,8 @@ RPL_PATH=$PKG_DIR/lib TLIB_PATH=$PKG_DIR/tool TTLIB_PATH=$TT_DIR/tool -# Default HIP path -if [ -z "$HIP_PATH" ] ; then - export HIP_PATH=/opt/rocm/hip -fi -# Default HCC path -if [ -z "$HCC_HOME" ] ; then - export HCC_HOME=/opt/rocm/hcc +if [ -z "$ROCP_PYTHON_VERSION" ] ; then + ROCP_PYTHON_VERSION=python3 fi # runtime API trace @@ -65,9 +60,9 @@ export HSA_VEN_AMD_AQLPROFILE_LOG=1 export ROCPROFILER_LOG=1 unset ROCPROFILER_SESS -# ROC Profiler environment -# Loading of ROC Profiler by HSA runtime -export HSA_TOOLS_LIB=$RPL_PATH/librocprofiler64.so +# Profiler environment +# Loading of profiler library by HSA runtime +MY_HSA_TOOLS_LIB="$RPL_PATH/librocprofiler64.so" # Loading of the test tool by ROC Profiler export ROCP_TOOL_LIB=$TLIB_PATH/libtool.so # Enabling HSA dispatches intercepting by ROC PRofiler @@ -162,14 +157,18 @@ usage() { echo " --ctx-wait - to wait for outstanding contexts on profiler exit [on]" echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" echo " --heartbeat - to print progress heartbeats [0 - disabled]" + echo " --obj-tracking - to turn on/off kernels code objects tracking [on]" + echo " To support V3 code object" echo "" echo " --stats - generating kernel execution stats, file .stats.csv" - echo " --roctx-trace - to enable rocTX trace" - echo " --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible" + echo "" + echo " --roctx-trace - to enable rocTX application code annotation trace, \"Markers and Ranges\" JSON trace section." 
+ echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" echo " --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible" - echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" - echo " Generated files: .hsa_stats.txt .json" + echo " '--hsa-trace' can be used in addition to select activity tracing from HSA (ROCr runtime) level" + echo " --kfd-trace - to trace KFD, generates KFD Thunk API execution stats and JSON file chrome-tracing compatible" + echo " Generated files: ._stats.txt .json" echo " Traced API list can be set by input .txt or .xml files." echo " Input .txt:" echo " hsa: hsa_queue_create hsa_amd_memory_pool_allocate" @@ -182,23 +181,29 @@ usage() { echo " --trace-start - to enable tracing on start [on]" echo " --trace-period - to enable trace with initial delay, with periodic sample length and rate" echo " Supported time formats: " - echo " --obj-tracking - to turn on/off kernels code objects tracking [off]" + echo " --flush-rate - to enable trace flush rate (time period)" + echo " Supported time formats: " echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." - echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'." + echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'." echo " An example of 'rpl_rc.xml':" echo " " echo "" exit 1 } +# checking for availability of rocminfo utility +`which rocminfo >/dev/null 2>&1` +if [ $? 
!= 0 ]; then fatal "'rocminfo' utility is not found: please add ROCM bin path to PATH env var."; fi + # profiling run method OUTPUT_LIST="" run() { @@ -233,13 +238,13 @@ run() { fi API_TRACE="" - LD_PRELOAD="" + MY_LD_PRELOAD="" if [ "$ROCTX_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":roctx" fi if [ "$KFD_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":kfd" - export LD_PRELOAD="$TT_DIR/lib/libkfdwrapper64.so libhsakmt.so.1 $LD_PRELOAD" + MY_LD_PRELOAD="$TT_DIR/lib/libkfdwrapper64.so libhsakmt.so.1 $MY_LD_PRELOAD" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" @@ -250,34 +255,44 @@ run() { if [ "$HSA_TRACE" = 1 ] ; then export ROCTRACER_DOMAIN=$API_TRACE":hsa" - export HSA_TOOLS_LIB="$HSA_TOOLS_LIB $TTLIB_PATH/libtracer_tool.so" + MY_HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB $TTLIB_PATH/libtracer_tool.so" elif [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE OUTPUT_LIST="$ROCP_OUTPUT_DIR/" - export HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" + MY_HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" fi - redirection_cmd="" + retval=1 if [ -n "$ROCP_OUTPUT_DIR" ] ; then - redirection_cmd="2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" + log_file="$ROCP_OUTPUT_DIR/log.txt" + exit_file="$ROCP_OUTPUT_DIR/exit.txt" + { + HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB" LD_PRELOAD="$MY_LD_PRELOAD" eval "$APP_CMD" + retval=$? + echo "exit($retval)" > $exit_file + } 2>&1 | tee "$log_file" + exitval=`cat "$exit_file" | sed -n "s/^.*exit(\([0-9]*\)).*$/\1/p"` + if [ -n "$exitval" ] ; then retval=$exitval; fi + else + HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB" LD_PRELOAD="$MY_LD_PRELOAD" eval "$APP_CMD" + retval=$? 
fi - - CMD_LINE="$APP_CMD $redirection_cmd" - eval "$CMD_LINE" - - unset LD_PRELOAD + return $retval } merge_output() { - output_dir=$(echo "$1" | sed "s/\/[^\/]*$//") - for file_name in `ls $output_dir` ; do - output_name=$(echo $file_name | sed -n "/\.txt$/ s/^[0-9]*_//p") - if [ -n "$output_name" ] ; then - trace_file=$output_dir/$file_name - output_file=$output_dir/$output_name - touch $output_file - cat $trace_file >> $output_file - fi + while [ -n "$1" ] ; do + output_dir=$(echo "$1" | sed "s/\/[^\/]*$//") + for file_name in `ls $output_dir` ; do + output_name=$(echo $file_name | sed -n "/\.txt$/ s/^[0-9]*_//p") + if [ -n "$output_name" ] ; then + trace_file=$output_dir/$file_name + output_file=$output_dir/$output_name + touch $output_file + cat $trace_file >> $output_file + fi + done + shift done } @@ -339,11 +354,11 @@ while [ 1 ] ; do export ROCP_METRICS="$2" elif [ "$1" = "--list-basic" ] ; then export ROCP_INFO=b - eval "$PKG_DIR/tool/ctrl" + HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB" eval "$PKG_DIR/tool/ctrl" exit 1 elif [ "$1" = "--list-derived" ] ; then export ROCP_INFO=d - eval "$PKG_DIR/tool/ctrl" + HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB" eval "$PKG_DIR/tool/ctrl" exit 1 elif [ "$1" = "--basenames" ] ; then if [ "$2" = "on" ] ; then @@ -373,6 +388,7 @@ while [ 1 ] ; do GEN_STATS=1 elif [ "$1" = "--roctx-trace" ] ; then ARG_VAL=0 + GEN_STATS=1 ROCTX_TRACE=1 elif [ "$1" = "--kfd-trace" ] ; then ARG_VAL=0 @@ -414,9 +430,14 @@ while [ 1 ] ; do convert_time_val period_rate errck "Option '$ARG_IN', rate value" export ROCP_CTRL_RATE="$period_delay:$period_len:$period_rate" + elif [ "$1" = "--flush-rate" ] ; then + period_rate=$2 + convert_time_val period_rate + errck "Option '$ARG_IN', rate value" + export ROCP_FLUSH_RATE="$period_rate" elif [ "$1" = "--obj-tracking" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_OBJ_TRACKING=1 + if [ "$2" = "off" ] ; then + export ROCP_OBJ_TRACKING=0 fi elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 @@ -506,11 +527,13 @@ if [ -n 
"$csv_output" ] ; then rm -f $csv_output fi -RET=0 +RET=1 for name in $input_list; do run $name $OUTPUT_DIR $APP_CMD + RET=$? if [ -n "$ROCPROFILER_SESS" -a -e "$ROCPROFILER_SESS/error" ] ; then - echo "Error found, profiling aborted." + error_string=`cat $ROCPROFILER_SESS/error` + echo "Profiling error found: '$error_string'" csv_output="" RET=1 break @@ -518,15 +541,16 @@ for name in $input_list; do done if [ -n "$csv_output" ] ; then + merge_output $OUTPUT_LIST if [ "$GEN_STATS" = "1" ] ; then db_output=$(echo $csv_output | sed "s/\.csv/.db/") - merge_output $OUTPUT_LIST - python $BIN_DIR/tblextr.py $db_output $OUTPUT_LIST + $ROCP_PYTHON_VERSION $BIN_DIR/tblextr.py $db_output $OUTPUT_LIST else - python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST + $ROCP_PYTHON_VERSION $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST fi if [ "$?" -ne 0 ] ; then - echo "Data extracting error: $OUTPUT_LIST'" + echo "Profiling data corrupted: '$OUTPUT_LIST'" | tee "$ROCPROFILER_SESS/error" + RET=1 fi fi diff --git a/bin/run_tool.sh b/bin/run_tool.sh deleted file mode 100755 index ed1609fa..00000000 --- a/bin/run_tool.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/sh -BIN_DIR=`dirname $0` -BIN_DIR=`realpath $BIN_DIR` -PKG_DIR=${BIN_DIR%/bin} - -# PATH to custom HSA libs -HSA_PATH=$PKG_DIR/lib/hsa - -if [ -z "$1" ] ; then - echo "Usage: $0 " - exit 1 -fi - -# profiler plugin library -test_app=$* - -# paths to ROC profiler and oher libraries -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH -export PATH=.:$PATH - -# ROC profiler library loaded by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so.1 -# tool library loaded by ROC profiler -if [ -z "$ROCP_TOOL_LIB" ] ; then - export ROCP_TOOL_LIB=libintercept_test.so -fi -# enable error messages -export HSA_TOOLS_REPORT_LOAD_FAILURE=1 -export HSA_VEN_AMD_AQLPROFILE_LOG=1 -export ROCPROFILER_LOG=1 -# ROC profiler metrics config file -unset ROCP_PROXY_QUEUE -# ROC profiler metrics config file -if [ -z "$ROCP_METRICS" 
] ; then - export ROCP_METRICS=$PKG_DIR/lib/metrics.xml -fi - -LD_PRELOAD=$ROCP_TOOL_LIB $test_app diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 805c954c..eb584503 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -1,3 +1,25 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + import csv, sqlite3, re, sys from functools import reduce from txt2params import gen_params @@ -48,6 +70,13 @@ def add_data_column(self, table_name, data_label, data_type, data_expr): cursor.execute('ALTER TABLE %s ADD COLUMN "%s" %s' % (table_name, data_label, data_type)) cursor.execute('UPDATE %s SET %s = (%s);' % (table_name, data_label, data_expr)) + def change_rec_name(self, table_name, rec_id, rec_name): + self.connection.execute('UPDATE ' + table_name + ' SET Name = ? WHERE "Index" = ?', (rec_name, rec_id)) + def change_rec_tid(self, table_name, rec_id, tid): + self.connection.execute('UPDATE ' + table_name + ' SET tid = ? WHERE "Index" = ?', (tid, rec_id)) + def change_rec_fld(self, table_name, fld_expr, rec_pat): + self.connection.execute('UPDATE ' + table_name + ' SET ' + fld_expr + ' WHERE ' + rec_pat) + # populate DB table entry def insert_entry(self, table, val_list): (cursor, stm) = table @@ -77,8 +106,8 @@ def _get_raws(self, table_name): def _get_raws_indexed(self, table_name): cursor = self.connection.execute('SELECT * FROM ' + table_name + ' order by "Index" asc;') return cursor.fetchall() - def _get_raw_by_id(self, table_name, req_id): - cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE "Index"=?', (req_id,)) + def _get_raw_by_id(self, table_name, rec_id): + cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE "Index"=?', (rec_id,)) raws = cursor.fetchall() if len(raws) != 1: raise Exception('Index is not unique, table "' + table_name + '"') @@ -97,7 +126,7 @@ def dump_csv(self, table_name, file_name): fd.write(','.join(fields) + '\n') for raw in self._get_raws(table_name): fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') - + # dump JSON trace def open_json(self, file_name): if not re.search(r'\.json$', file_name): @@ -115,7 +144,7 @@ def label_json(self, pid, label, file_name): if not 
re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: - fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)); + fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)) self.section_index += 1 def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): @@ -134,6 +163,21 @@ def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dic fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%s,"tid":0,"name":"dep"}\n' % (to_ts, dep_id, str(to_pid))) dep_id += 1 + def metadata_json(self, jsonfile, sysinfo_file): + params = gen_params(sysinfo_file); + with open(jsonfile, mode='a') as fd: + cnt = 0 + fd.write('],\n') + fd.write('"otherData": {\n') + for nkey in sorted(params.keys()): + key = nkey[1] + cnt = cnt + 1 + if cnt == len(params): + fd.write(' "' + key + '": "' + params[nkey] + '"\n') + else: + fd.write(' "' + key + '": "' + params[nkey] + '",\n') + fd.write(' }\n') + def dump_json(self, table_name, data_name, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) @@ -231,19 +275,4 @@ def add_csv_table(self, table_name, file_name, extra = ()): table = self.add_table(table_name, descr, extra) self.insert_table(table, reader) - def metadata_json(self, jsonfile, sysinfo_file): - params = gen_params(sysinfo_file); - with open(jsonfile, mode='a') as fd: - cnt = 0 - fd.write('],\n') - fd.write('"otherData": {\n') - for key in params: - cnt = cnt + 1 - if cnt == len(params): - fd.write(' "' + key + '": "' + params[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params[key] + '",\n') - fd.write(' }\n') - ############################################################################################## 
- diff --git a/bin/tblextr.py b/bin/tblextr.py index 3f47c65d..0fe46336 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -22,16 +20,10 @@ # THE SOFTWARE. ################################################################################ -import os, sys, re +import os, sys, re, subprocess from sqlitedb import SQLiteDB import dform -# Parsing results in the format: -#dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): -# GRBM_GUI_ACTIVE (74332) -# SQ_WAVES (4096) -# SQ_INSTS_VMEM_RD (36864) - EXT_PID = 0 COPY_PID = 1 HIP_PID = 2 @@ -59,17 +51,6 @@ var_table = {} ############################################################# -def metadata_gen(sysinfo_file, sysinfo_cmd): - if not re.search(r'\.txt$', sysinfo_file): - raise Exception('wrong output file type: "' + sysinfo_file + '"' ) - if re.search(r'rocminfo', sysinfo_cmd): - direct_str = " > " - else: - direct_str = " >> " - status, output = commands.getstatusoutput(sysinfo_cmd + direct_str + sysinfo_file) - if status != 0 : - raise Exception('Could not run command: ' + sysinfo_cmd) - def fatal(msg): sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); sys.exit(1) @@ -82,6 +63,22 @@ def dbglog(msg): fatal("error") ############################################################# +# Dumping sysinfo +sysinfo_begin = 1 +def metadata_gen(sysinfo_file, sysinfo_cmd): + global sysinfo_begin + if not re.search(r'\.txt$', sysinfo_file): + raise Exception('wrong output file type: "' + sysinfo_file + '"' ) + if sysinfo_begin == 1: + sysinfo_begin = 0 + with open(sysinfo_file, mode='w') as fd: fd.write('') + with open(sysinfo_file, mode='a') as fd: fd.write('CMD: ' + sysinfo_cmd + '\n') + status = subprocess.call(sysinfo_cmd + ' >> ' + sysinfo_file, + 
stderr=subprocess.STDOUT, + shell=True) + if status != 0: + raise Exception('Could not run command: "' + sysinfo_cmd + '"') + # parse results method def parse_res(infile): global max_gpu_id @@ -102,7 +99,7 @@ def parse_res(infile): if not dispatch_number in var_table: fatal("Error: dispatch number not found '" + str(dispatch_number) + "'") var = m.group(1) val = m.group(2) - var_table[dispatch_number][m.group(1)] = m.group(2) + var_table[dispatch_number][var] = val if not var in var_list: var_list.append(var) m = beg_pattern.match(record) @@ -256,18 +253,35 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): rec_stack = pid_stack[tid] rec_vals = rec_stack.pop() rec_vals[1] = tms - + db.insert_entry(table_handle, rec_vals) record_id += 1 return 1 ############################################################# +def extract_field(rec_args, field): + ptrn1_field = re.compile(r'^.*'+field+'\('); + ptrn2_field = re.compile(r'\)\) .*$'); + (field_name, n_subs) = ptrn1_field.subn('', rec_args, count=1); + if n_subs != 0: + (field_name, n_subs) = ptrn2_field.subn(')', field_name, count=1) + return (field_name, n_subs) + # Fill API DB api_table_descr = [ ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index'], {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER'} ] +# Filling API records DB table +# table_name - created DB table name +# db - DB handle +# indir - input directory +# api_name - traced API name +# api_pid - assigned JSON PID +# dep_pid - PID of dependet domain +# dep_list - list of dependet dospatch events +# dep_filtr - registered dependencies by record ID def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): global hsa_activity_found copy_raws = [] @@ -278,6 +292,10 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep file_name = indir + '/' + api_name + '_api_trace.txt' ptrn_val = 
re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') ptrn_ac = re.compile(r'hsa_amd_memory_async_copy') + ptrn1_kernel = re.compile(r'^.*kernel\(') + ptrn2_kernel = re.compile(r'\)\) .*$') + ptrn_fixformat = re.compile(r'(\d+:\d+ \d+:\d+ \w+)\(\s*(.*)\)$') + ptrn_fixkernel = re.compile(r'\s+kernel=(.*)$') if not os.path.isfile(file_name): return 0 @@ -285,36 +303,45 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_from_us_list = [] dep_id_list = [] - global START_US - with open(file_name, mode='r') as fd: - line = fd.readline() - record = line[:-1] - m = ptrn_val.match(record) - if m: START_US = int(m.group(1)) / 1000 - START_US = 0 - + # parsing an input trace file and creating a DB table record_id = 0 table_handle = db.add_table(table_name, api_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): record = line[:-1] + + kernel_arg = '' + m = ptrn_fixkernel.search(record) + if m: + kernel_arg = 'kernel(' + m.group(1) + ') ' + record = ptrn_fixkernel.sub('', record) + + mfixformat = ptrn_fixformat.match(record) + if mfixformat: #replace '=' in args with parentheses + reformated_args = kernel_arg + mfixformat.group(2).replace('=','(').replace(',',')')+')' + record = mfixformat.group(1) + '(' + reformated_args + ')' + m = ptrn_val.match(record) if m: rec_vals = [] rec_len = len(api_table_descr[0]) for ind in range(1,rec_len): rec_vals.append(m.group(ind)) + proc_id = rec_vals[2] rec_vals[2] = api_pid rec_vals.append(record_id) db.insert_entry(table_handle, rec_vals) + + # dependencies filling if ptrn_ac.search(rec_vals[4]) or record_id in dep_filtr: beg_ns = int(rec_vals[0]) end_ns = int(rec_vals[1]) from_us = (beg_ns / 1000) + ((end_ns - beg_ns) / 1000) dep_from_us_list.append(from_us) dep_tid_list.append(int(rec_vals[3])) - dep_id_list.append(record_id) + dep_id_list.append(record_id) + # memcopy data if len(copy_raws) != 0: copy_data = list(copy_raws[copy_index]) args_str = rec_vals[5] @@ -324,13 
+351,30 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep copy_csv += str(copy_index) + ', ' + copy_line + '\n' copy_index += 1 + # patching activity properties: kernel name, stream-id + corr_id = record_id + if (corr_id, proc_id) in dep_filtr: + record_args = rec_vals[rec_len - 2] + select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + proc_id + # extract kernel name + (kernel_name, n_subs) = extract_field(record_args, 'kernel') + if n_subs != 0: + db.change_rec_fld('OPS', 'Name = "' + kernel_name + '"', select_expr) + # extract stream-id + (stream_id, n_subs) = extract_field(record_args, 'stream') + if n_subs != 0: + if stream_id == 'nil' or stream_id == 'NIL': stream_id = 0 + db.change_rec_fld('OPS', 'tid = ' + stream_id, select_expr) + record_id += 1 else: fatal(api_name + " bad record: '" + record + "'") + # inserting of dispatch events correlated to the dependent dispatches for (tid, from_ns) in dep_list: db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) record_id += 1 + # registering dependencies informatino if dep_pid != NONE_PID: if not dep_pid in dep_dict: dep_dict[dep_pid] = {} dep_dict[dep_pid]['pid'] = api_pid @@ -338,6 +382,7 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_dict[dep_pid]['from'] = dep_from_us_list if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + # generating memcopy CSV if copy_csv != '': file_name = os.environ['PWD'] + '/results_mcopy.csv' with open(file_name, mode='w') as fd: @@ -386,46 +431,72 @@ def fill_copy_db(table_name, db, indir): # fill HCC ops DB ops_table_descr = [ - ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index'], - {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} + ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'proc-id'], 
+ {'Index':'INTEGER', 'proc-id':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} ] -def fill_ops_db(table_name, db, indir): +def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): global max_gpu_id file_name = indir + '/' + 'hcc_ops_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) (.*)$') - ptrn_id = re.compile(r'^[^:]+:(\d+)$') + ptrn_id = re.compile(r'^([^:]+):(\d+):(\d+)$') + ptrn_mcopy = re.compile(r'(Memcpy|Copy|Fill)') + ptrn_barrier = re.compile(r'Marker') if not os.path.isfile(file_name): return {} filtr = {} record_id = 0 - table_handle = db.add_table(table_name, ops_table_descr) + kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr) + mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): record = line[:-1] m = ptrn_val.match(record) if m: + # parsing trace record rec_vals = [] for ind in range(1,6): rec_vals.append(m.group(ind)) - gpu_id = int(rec_vals[2]); - if (gpu_id > max_gpu_id): max_gpu_id = gpu_id - gpu_pid = GPU_BASE_PID + int(gpu_id) - rec_vals.append(gpu_pid) - rec_vals.append(0) - m = ptrn_id.match(rec_vals[4]) + label = rec_vals[4] # record name + m = ptrn_id.match(label) if not m: fatal("bad hcc ops entry '" + record + "'") - corr_id = int(m.group(1)) - 1 - rec_vals.append(corr_id) + name = m.group(1) + corr_id = int(m.group(2)) - 1 + proc_id = m.group(3) + + # checking name for memcopy pattern + if ptrn_mcopy.search(name): + table_handle = mcopy_table_handle + pid = COPY_PID; + else: + table_handle = kernel_table_handle + + gpu_id = int(rec_vals[2]); + if (gpu_id > max_gpu_id): max_gpu_id = gpu_id + pid = GPU_BASE_PID + int(gpu_id) + + if ptrn_barrier.search(name): + name = '""' + + # insert DB record + rec_vals[4] = name # Name + rec_vals.append(pid) # pid + rec_vals.append(0) # tid + rec_vals.append(corr_id) # Index 
+ rec_vals.append(proc_id) # proc-id db.insert_entry(table_handle, rec_vals) - filtr[corr_id] = 1 - if not gpu_pid in dep_dict: - dep_dict[gpu_pid] = {} - dep_dict[gpu_pid]['to'] = {} - dep_dict[gpu_pid]['to'][corr_id] = int(rec_vals[0]) / 1000 - dep_dict[gpu_pid]['bsp'] = OPS_PID - else: fatal("hcc ops bad record: '" + record + "'") + # registering a dependency filtr + filtr[(corr_id, proc_id)] = 1 + + # filling a dependency + if not pid in dep_dict: dep_dict[pid] = {} + if not 'to' in dep_dict[pid]: dep_dict[pid]['to'] = {} + dep_dict[pid]['to'][corr_id] = int(rec_vals[0]) / 1000 + dep_dict[pid]['bsp'] = OPS_PID + + else: + fatal("hcc ops bad record: '" + record + "'") return filtr ############################################################# @@ -462,8 +533,10 @@ def fill_ops_db(table_name, db, indir): hsa_statfile = re.sub(r'\.stats\.csv$', r'.hsa_stats.csv', statfile) hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile) kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) - sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo_stats.txt', statfile) - metadata_gen(sysinfo_file, '/opt/rocm/bin/rocminfo') + ops_statfile = statfile + copy_statfile = re.sub(r'\.stats\.csv$', r'.copy_stats.csv', statfile) + sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo.txt', statfile) + metadata_gen(sysinfo_file, 'rocminfo') with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) @@ -475,12 +548,15 @@ def fill_ops_db(table_name, db, indir): hsa_activity_found = fill_copy_db('COPY', db, indir) hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) - ops_filtr = fill_ops_db('OPS', db, indir) + ops_filtr = fill_ops_db('OPS', 'COPY', db, indir) hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) fill_kernel_db('A', db) any_trace_found = ext_trace_found | kfd_trace_found | hsa_trace_found | hip_trace_found + copy_trace_found = 0 + if hsa_activity_found or 
len(ops_filtr): copy_trace_found = 1 + if any_trace_found: db.open_json(jsonfile) @@ -496,8 +572,7 @@ def fill_ops_db(table_name, db, indir): if kfd_trace_found: db.label_json(KFD_PID, "CPU KFD API", jsonfile) - if hsa_activity_found: - db.label_json(COPY_PID, "COPY", jsonfile) + db.label_json(COPY_PID, "COPY", jsonfile) if any_trace_found and max_gpu_id >= 0: for ind in range(0, int(max_gpu_id) + 1): @@ -517,8 +592,9 @@ def fill_ops_db(table_name, db, indir): dform.gen_table_bins(db, 'HSA', hsa_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) - if hsa_activity_found: + if copy_trace_found: dform.post_process_data(db, 'COPY') + dform.gen_table_bins(db, 'COPY', copy_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) if hip_trace_found: @@ -526,11 +602,11 @@ def fill_ops_db(table_name, db, indir): dform.gen_table_bins(db, 'HIP', hip_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) + if ops_filtr: dform.post_process_data(db, 'OPS') + dform.gen_table_bins(db, 'OPS', ops_statfile, 'Name', 'DurationNs') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) - metadata_gen(sysinfo_file, '/opt/rocm/bin/hipcc --version') - if kfd_trace_found: dform.post_process_data(db, 'KFD') dform.gen_table_bins(db, 'KFD', kfd_statfile, 'Name', 'DurationNs') diff --git a/bin/txt2params.py b/bin/txt2params.py index 358acc3a..7944029f 100644 --- a/bin/txt2params.py +++ b/bin/txt2params.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
# @@ -30,13 +28,14 @@ # it also processes encompasing sections to generate a full param name such as (section names separated by '_'): # "Agent2_PoolInfo_ISAInfo_ISA1_WorkgroupMaxSizeperDimension_x": "1024(0x400)", def gen_params(txtfile): - fields = {} + fields = {} + counter = 0 parent_field = '' nbr_indent = 0 nbr_indent_prev = 0 check_for_dims = False - with open(txtfile) as fp: - for line in fp: + with open(txtfile) as fp: + for line in fp: me = re.match(r'\*\*\* Done \*\*\*',line) #Marks the end of cmd if me: parent_field = '' @@ -48,7 +47,8 @@ def gen_params(txtfile): if mv: key = 'HCCclangversion' val = mv.group(1) - fields[key] = val + counter = counter + 1 + fields[(counter,key)] = val continue # Variable 'check_for_dims' is True for text like this: # Workgroup Max Size per Dimension: @@ -56,34 +56,35 @@ def gen_params(txtfile): # y 1024(0x400) # z 1024(0x400) if check_for_dims == True: - mc = re.match(r'\s*([x|y|z])\s+(.*)',line) + mc = re.match(r'\s*([x|y|z])\s+(.*)',line) if mc: key_sav = mc.group(1) if parent_field != '': - key = parent_field + '_' + mc.group(1) + key = parent_field + '.' 
+ mc.group(1) else: key = mc.group(1) val = re.sub(r"\s+", "", mc.group(2)) - fields[key] = val + counter = counter + 1 + fields[(counter,key)] = val if key_sav == 'z': check_for_dims = False nbr_indent_prev = nbr_indent - mi = re.search(r'^(\s+)\w+', line) + mi = re.search(r'^(\s+)\w+.*', line) md = re.search(':', line) if mi: - nbr_indent = len(mi.group(1)) / 2 #indentation cnt + nbr_indent = int(len(mi.group(1)) / 2) #indentation cnt else: if not md: tmp = re.sub(r"\s+", "", line) if tmp.isalnum(): parent_field = tmp - continue - - if nbr_indent < nbr_indent_prev: - pos = parent_field.rfind('_') - if pos != -1: - parent_field = parent_field[:pos] + if nbr_indent < nbr_indent_prev: + go_back_parent = (nbr_indent_prev - nbr_indent) + for i in range(go_back_parent): #decrease as many levels up as needed + pos = parent_field.rfind('.') + if pos != -1: + parent_field = parent_field[:pos] # Process lines such as : # Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED # Size: 131897644(0x7dc992c) KB @@ -93,17 +94,17 @@ def gen_params(txtfile): if m: key, val = m.group(1), m.group(2) if parent_field != '': - key = parent_field + '_' + key + key = parent_field + '.' + key if val == '': mk = re.match(r'.*Dimension',key) if mk: # expect x,y,z on next 3 lines check_for_dims = True - parent_field = key + parent_field = key else: - fields[key] = val + counter = counter + 1 + fields[(counter,key)] = val else: if nbr_indent != nbr_indent_prev and not check_for_dims : - parent_field = parent_field + '_' + lin.replace(':','') - - return fields + parent_field = parent_field + '.' 
+ lin.replace(':','') + return fields diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 44fb0cd0..30e86c13 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -31,7 +31,6 @@ add_definitions ( -DUNIX_OS ) add_definitions ( -DLINUX ) add_definitions ( -D__AMD64__ ) add_definitions ( -D__x86_64__ ) -add_definitions ( -DAMD_INTERNAL_BUILD ) add_definitions ( -DLITTLEENDIAN_CPU=1 ) add_definitions ( -DHSA_LARGE_MODEL= ) add_definitions ( -DHSA_DEPRECATED= ) @@ -109,16 +108,14 @@ elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" ) endif () ## Find hsa-runtime headers/lib -find_file ( HSA_RUNTIME_INC "hsa.h" ) -if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) - find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) -endif() +find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" ) -get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) -get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY ) +get_filename_component ( HSA_RUNTIME_INC_PATH "${HSA_RUNTIME_INC}" DIRECTORY ) +get_filename_component ( HSA_RUNTIME_LIB_PATH "${HSA_RUNTIME_LIB}" DIRECTORY ) find_library ( HSA_KMT_LIB "libhsakmt.so" ) -get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY ) +get_filename_component ( HSA_KMT_LIB_PATH "${HSA_KMT_LIB}" DIRECTORY ) +get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_LIB_PATH}" DIRECTORY ) ## Basic Tool Chain Information message ( "----------------NBit: ${NBIT}" ) @@ -127,5 +124,22 @@ message ( "------------Compiler: ${CMAKE_CXX_COMPILER}" ) message ( "----Compiler-Version: ${CMAKE_CXX_COMPILER_VERSION}" ) message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) -message ( "-----------CXX-Flags: ${CMAKE_CXX_FLAGS}" ) +message ( "----HSA_KMT_LIB_PATH: ${HSA_KMT_LIB_PATH}" ) +message ( "-------ROCM_ROOT_DIR: ${ROCM_ROOT_DIR}" ) +message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) message ( 
"---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) +message ( "---------GPU_TARGETS: ${GPU_TARGETS}" ) + +## Check the ROCm pathes +if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) + message ( FATAL_ERROR "HSA_RUNTIME_INC_PATH is not found." ) +endif () +if ( "${HSA_RUNTIME_LIB_PATH}" STREQUAL "" ) + message ( FATAL_ERROR "HSA_RUNTIME_LIB_PATH is not found." ) +endif () +if ( "${HSA_KMT_LIB_PATH}" STREQUAL "" ) + message ( FATAL_ERROR "HSA_KMT_LIB_PATH is not found." ) +endif () +if ( "${ROCM_ROOT_DIR}" STREQUAL "" ) + message ( FATAL_ERROR "ROCM_ROOT_DIR is not found." ) +endif () diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 31082cf4..24925cae 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -41,12 +41,13 @@ THE SOFTWARE. #ifndef INC_ROCPROFILER_H_ #define INC_ROCPROFILER_H_ -#include #include +#include +#include #include #include -#define ROCPROFILER_VERSION_MAJOR 7 +#define ROCPROFILER_VERSION_MAJOR 8 #define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus @@ -70,6 +71,7 @@ typedef struct { uint32_t trace_local; uint64_t timeout; uint32_t timestamp_on; + uint32_t hsa_intercepting; } rocprofiler_settings_t; //////////////////////////////////////////////////////////////////////////////// @@ -87,7 +89,9 @@ hsa_status_t rocprofiler_error_string( // Profiling feature kind typedef enum { ROCPROFILER_FEATURE_KIND_METRIC = 0, - ROCPROFILER_FEATURE_KIND_TRACE = 1 + ROCPROFILER_FEATURE_KIND_TRACE = 1, + ROCPROFILER_FEATURE_KIND_SPM_MOD = 2, + ROCPROFILER_FEATURE_KIND_PCSMP_MOD = 4 } rocprofiler_feature_kind_t; // Profiling feture parameter @@ -199,17 +203,25 @@ hsa_status_t rocprofiler_close(rocprofiler_t* context); // [in] profiling conte hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling context uint32_t group_index); // group index +// Return context agent +hsa_status_t rocprofiler_get_agent(rocprofiler_t* context, // [in] profiling context + hsa_agent_t* agent); // [out] GPU handle + // Supported time value ID typedef enum { 
ROCPROFILER_TIME_ID_CLOCK_REALTIME = 0, // Linux realtime clock time - ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 1, // Linux monotonic clock time + ROCPROFILER_TIME_ID_CLOCK_REALTIME_COARSE = 1, // Linux realtime-coarse clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 2, // Linux monotonic clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC_COARSE = 3, // Linux monotonic-coarse clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC_RAW = 4, // Linux monotonic-raw clock time } rocprofiler_time_id_t; // Return time value for a given time ID and profiling timestamp hsa_status_t rocprofiler_get_time( rocprofiler_time_id_t time_id, // identifier of the particular time to convert the timesatmp uint64_t timestamp, // profiling timestamp - uint64_t* value_ns); // [out] returned time 'ns' value + uint64_t* value_ns, // [out] returned time 'ns' value, ignored if NULL + uint64_t* error_ns); // [out] returned time error 'ns' value, ignored if NULL //////////////////////////////////////////////////////////////////////////////// // Queue callbacks @@ -237,7 +249,7 @@ typedef struct { const char* kernel_name; // Kernel name uint64_t kernel_object; // Kernel object address const amd_kernel_code_t* kernel_code; // Kernel code pointer - int64_t thread_id; // Thread id + uint32_t thread_id; // Thread id const rocprofiler_dispatch_record_t* record; // Dispatch record } rocprofiler_callback_data_t; @@ -262,6 +274,10 @@ hsa_status_t rocprofiler_set_queue_callbacks( // Remove queue callbacks hsa_status_t rocprofiler_remove_queue_callbacks(); +// Start/stop queue callbacks +hsa_status_t rocprofiler_start_queue_callbacks(); +hsa_status_t rocprofiler_stop_queue_callbacks(); + //////////////////////////////////////////////////////////////////////////////// // Start/stop profiling // @@ -455,6 +471,68 @@ hsa_status_t rocprofiler_pool_flush( rocprofiler_pool_t* pool); // profiling pool handle //////////////////////////////////////////////////////////////////////////////// +// HSA intercepting API + +// 
HSA callbacks ID enumeration +typedef enum { + ROCPROFILER_HSA_CB_ID_ALLOCATE = 0, // Memory allocate callback + ROCPROFILER_HSA_CB_ID_DEVICE = 1, // Device assign callback + ROCPROFILER_HSA_CB_ID_MEMCOPY = 2, // Memcopy callback + ROCPROFILER_HSA_CB_ID_SUBMIT = 3 // Packet submit callback +} rocprofiler_hsa_cb_id_t; + +// HSA callback data type +typedef struct { + union { + struct { + const void* ptr; // allocated area ptr + size_t size; // allocated area size, zero size means 'free' callback + hsa_amd_segment_t segment; // allocated area's memory segment type + hsa_amd_memory_pool_global_flag_t global_flag; // allocated area's memory global flag + int is_code; // equal to 1 if code is allocated + } allocate; + struct { + hsa_device_type_t type; // type of assigned device + uint32_t id; // id of assigned device + hsa_agent_t agent; // device HSA agent handle + const void* ptr; // ptr the device is assigned to + } device; + struct { + const void* dst; // memcopy dst ptr + const void* src; // memcopy src ptr + size_t size; // memcopy size bytes + } memcopy; + struct { + const void* packet; // submitted to GPU packet + const char* kernel_name; // kernel name, not NULL if dispatch + hsa_queue_t* queue; // HSA queue the kernel was submitted to + uint32_t device_type; // type of device the packed is submitted to + uint32_t device_id; // id of device the packed is submitted to + } submit; + }; +} rocprofiler_hsa_callback_data_t; + +// HSA callback function type +typedef hsa_status_t (*rocprofiler_hsa_callback_fun_t)( + rocprofiler_hsa_cb_id_t id, // callback id + const rocprofiler_hsa_callback_data_t* data, // [in] callback data + void* arg); // [in/out] user passed data + +// HSA callbacks structure +typedef struct { + rocprofiler_hsa_callback_fun_t allocate; // memory allocate callback + rocprofiler_hsa_callback_fun_t device; // agent assign callback + rocprofiler_hsa_callback_fun_t memcopy; // memory copy callback + rocprofiler_hsa_callback_fun_t submit; // packet 
submit callback +} rocprofiler_hsa_callbacks_t; + +// Set callbacks. If the callback is NULL then it is disabled. +// If callback returns a value that is not HSA_STATUS_SUCCESS the callback +// will be unregistered. +hsa_status_t rocprofiler_set_hsa_callbacks( + const rocprofiler_hsa_callbacks_t callbacks, // HSA callback function + void* arg); // callback user data + #ifdef __cplusplus } // extern "C" block #endif // __cplusplus diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9a398411..4c97ea6f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,8 +30,9 @@ set ( LIB_SRC ${LIB_DIR}/core/simple_proxy_queue.cpp ${LIB_DIR}/core/intercept_queue.cpp ${LIB_DIR}/core/metrics.cpp + ${LIB_DIR}/core/activity.cpp ${LIB_DIR}/util/hsa_rsrc_factory.cpp ) add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} ) -target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ${HSA_KMT_LIB_PATH}/.. ) target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++) diff --git a/src/core/activity.cpp b/src/core/activity.cpp new file mode 100644 index 00000000..c72977e1 --- /dev/null +++ b/src/core/activity.cpp @@ -0,0 +1,171 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include + +#include +#include + +// Tracer messages protocol +#include + +#include "core/context.h" +#include "inc/rocprofiler.h" +#include "util/hsa_rsrc_factory.h" + +#define PUBLIC_API __attribute__((visibility("default"))) + +// Error handler +void fatal(const std::string msg) { + fflush(stdout); + fprintf(stderr, "%s\n\n", msg.c_str()); + fflush(stderr); + abort(); +} + +// Check returned HSA API status +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// Activity primitives +namespace activity_prim { +// PC sampling callback data +struct pcsmp_callback_data_t { + const char* kernel_name; // sampled kernel name + void* data_buffer; // host buffer for tracing data + uint64_t id; // sample id + uint64_t cycle; // sample cycle + uint64_t pc; // sample PC +}; + +uint32_t activity_op = UINT32_MAX; +void* activity_arg = NULL; +std::atomic activity_callback{NULL}; +rocprofiler_t* context = NULL; + +hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* data) { + const pcsmp_callback_data_t* pcsmp_data = (pcsmp_callback_data_t*) data; + + activity_record_t record{}; + record.op = activity_op; + record.pc_sample.se = pcsmp_data->id; + record.pc_sample.cycle = pcsmp_data->cycle; + record.pc_sample.pc = pcsmp_data->pc; + activity_async_callback_t fun = activity_callback.load(std::memory_order_acquire); + if (fun) { + (fun)(activity_op, &record, activity_arg); + } else { + free((void*)(pcsmp_data->kernel_name)); + } + return HSA_STATUS_SUCCESS; +} + +bool context_handler(rocprofiler_group_t group, void* arg) { + hsa_agent_t agent{}; + hsa_status_t status = rocprofiler_get_agent(group.context, &agent); + 
check_status(status); + const rocprofiler::util::AgentInfo* agent_info = rocprofiler::util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + + pcsmp_callback_data_t pcsmp_data{}; + pcsmp_data.kernel_name = (const char*)arg; + pcsmp_data.data_buffer = rocprofiler::util::HsaRsrcFactory::Instance().AllocateSysMemory(agent_info, rocprofiler::TraceProfile::GetSize()); + status = rocprofiler_iterate_trace_data(group.context, trace_data_cb, &pcsmp_data); + check_status(status); + return false; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + // context features + const rocprofiler_feature_kind_t trace_kind = + (rocprofiler_feature_kind_t)(ROCPROFILER_FEATURE_KIND_TRACE | ROCPROFILER_FEATURE_KIND_PCSMP_MOD); + const uint32_t feature_count = 1; + const uint32_t parameter_count = 1; + rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; + memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); + rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; + memset(features, 0, parameter_count * sizeof(rocprofiler_parameter_t)); + parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; + parameters[0].value = 0; + + features[0].kind = trace_kind; + features[0].parameters = parameters; + features[0].parameter_count = parameter_count; + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = (void*)strdup(callback_data->kernel_name); + + // Open profiling context + hsa_status_t status = rocprofiler_open(callback_data->agent, features, feature_count, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + return status; +} +} // namespace activity_prim + +extern "C" { 
+PUBLIC_API const char* GetOpName(uint32_t op) { return strdup("PCSAMPLE"); } + +PUBLIC_API bool RegisterApiCallback(uint32_t op, void* callback, void* arg) { return true; } + +PUBLIC_API bool RemoveApiCallback(uint32_t op) { return true; } + +PUBLIC_API bool InitActivityCallback(void* callback, void* arg) { + activity_prim::activity_arg = arg; + activity_prim::activity_callback.store((activity_async_callback_t)callback, std::memory_order_release); + + rocprofiler_queue_callbacks_t queue_callbacks{}; + queue_callbacks.dispatch = activity_prim::dispatch_callback; + rocprofiler_set_queue_callbacks(queue_callbacks, NULL); + + return true; +} + +PUBLIC_API bool EnableActivityCallback(uint32_t op, bool enable) { + if (enable) { + activity_prim::activity_op = op; + rocprofiler_start_queue_callbacks(); + } else { + rocprofiler_stop_queue_callbacks(); + } + return true; +} +} // extern "C" diff --git a/src/core/context.h b/src/core/context.h index f3ab1294..7131d338 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -257,7 +257,7 @@ class Context { void GetMetricsData() const { const MetricArgs args(info_map_); - for (const auto &v : metrics_map_) { + for (const auto& v : metrics_map_) { const std::string& name = v.first; const Metric* metric = v.second; const xml::Expr* expr = metric->GetExpr(); @@ -276,6 +276,7 @@ class Context { profile_vector_t profile_vector; set_[0].GetTraceProfiles(profile_vector); for (auto& tuple : profile_vector) { + if (pcsmp_mode_) const_cast(tuple.profile)->event_count = UINT32_MAX; const hsa_status_t status = api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, callback, data); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); @@ -293,6 +294,7 @@ class Context { return false; } + hsa_agent_t GetAgent() const { return agent_; } Group* GetGroup(const uint32_t& index) { return &set_[index]; } rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } @@ -306,7 
+308,8 @@ class Context { api_(hsa_rsrc_->AqlProfileApi()), metrics_(NULL), handler_(handler), - handler_arg_(handler_arg) + handler_arg_(handler_arg), + pcsmp_mode_(false) {} ~Context() { Destruct(); } @@ -434,10 +437,13 @@ class Context { const uint32_t group_index = block_status.group_index; set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); } - } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features - if (info->parameters != NULL) { - set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); - } else { + } else if (kind & ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features + info->kind = ROCPROFILER_FEATURE_KIND_TRACE; + + const event_t* event = NULL; + if (kind & ROCPROFILER_FEATURE_KIND_PCSMP_MOD) { // PC sampling + pcsmp_mode_ = true; + } else if (kind & ROCPROFILER_FEATURE_KIND_SPM_MOD) { // SPM trace const Metric* metric = metrics_->Get(name); if (metric == NULL) EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); @@ -445,9 +451,9 @@ class Context { if (counters_vec.size() != 1) EXC_RAISING(HSA_STATUS_ERROR, "trace bad metric '" << name << "' is not base counter"); const counter_t* counter = counters_vec[0]; - const event_t* event = &(counter->event); - set_[0].Insert(profile_info_t{event, NULL, 0, info}); + event = &(counter->event); } + set_[0].Insert(profile_info_t{event, info->parameters, info->parameter_count, info}); } else { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -584,6 +590,9 @@ class Context { // Context completion handler rocprofiler_handler_t handler_; void* handler_arg_; + + // PC sampling mode + bool pcsmp_mode_; }; } // namespace rocprofiler diff --git a/src/core/hsa_interceptor.h b/src/core/hsa_interceptor.h new file mode 100644 index 00000000..f1d8a0d8 --- /dev/null +++ b/src/core/hsa_interceptor.h @@ -0,0 +1,385 @@ 
+/****************************************************************************** +MIT License + +Copyright (c) 2018 ROCm Core Technology + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*******************************************************************************/ + +#ifndef _SRC_CORE_HSA_INTERCEPTOR_H +#define _SRC_CORE_HSA_INTERCEPTOR_H + +#include +#include +#include + +#include +#include + +#include "inc/rocprofiler.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" + +#define HSA_RT(call) \ + do { \ + const hsa_status_t status = call; \ + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, #call); \ + } while(0) + +#define IS_HSA_CALLBACK(ID) \ + const auto __id = ID; (void)__id; \ + void *__arg = arg_.load(); (void)__arg; \ + rocprofiler_hsa_callback_fun_t __callback = \ + (ID == ROCPROFILER_HSA_CB_ID_ALLOCATE) ? callbacks_.allocate: \ + (ID == ROCPROFILER_HSA_CB_ID_DEVICE) ? callbacks_.device: \ + (ID == ROCPROFILER_HSA_CB_ID_MEMCOPY) ? 
callbacks_.memcopy: \ + callbacks_.submit; \ + if ((__callback != NULL) && (recursion_ == false)) + +#define DO_HSA_CALLBACK \ + do { \ + recursion_ = true; \ + __callback(__id, &data, __arg); \ + recursion_ = false; \ + } while (0) + +#define ISSUE_HSA_CALLBACK(ID) \ + do { IS_HSA_CALLBACK(ID) { DO_HSA_CALLBACK; } } while(0) + +namespace rocprofiler { +extern decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; +extern decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; +extern decltype(hsa_memory_copy)* hsa_memory_copy_fn; +extern decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn; +extern decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn; +extern decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn; +extern decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; +extern decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; +extern decltype(hsa_executable_destroy)* hsa_executable_destroy_fn; + +class HsaInterceptor { + public: + typedef std::atomic arg_t; + typedef std::mutex mutex_t; + + static void Enable(const bool& enable) { enable_ = enable; } + + static void HsaIntercept(HsaApiTable* table) { + if (enable_) { + // Fetching AMD Loader HSA extension API + HSA_RT(hsa_system_get_major_extension_table( + HSA_EXTENSION_AMD_LOADER, + 1, + sizeof(hsa_ven_amd_loader_1_01_pfn_t), + &LoaderApiTable)); + + // Saving original API functions + hsa_memory_allocate_fn = table->core_->hsa_memory_allocate_fn; + hsa_memory_assign_agent_fn = table->core_->hsa_memory_assign_agent_fn; + hsa_memory_copy_fn = table->core_->hsa_memory_copy_fn; + hsa_amd_memory_pool_allocate_fn = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; + hsa_amd_memory_pool_free_fn = table->amd_ext_->hsa_amd_memory_pool_free_fn; + hsa_amd_agents_allow_access_fn = table->amd_ext_->hsa_amd_agents_allow_access_fn; + hsa_amd_memory_async_copy_fn = table->amd_ext_->hsa_amd_memory_async_copy_fn; + hsa_executable_freeze_fn = 
table->core_->hsa_executable_freeze_fn; + hsa_executable_destroy_fn = table->core_->hsa_executable_destroy_fn; + + // Intercepting HSA API + table->core_->hsa_memory_allocate_fn = MemoryAllocate; + table->core_->hsa_memory_assign_agent_fn = MemoryAssignAgent; + table->core_->hsa_memory_copy_fn = MemoryCopy; + table->amd_ext_->hsa_amd_memory_pool_allocate_fn = MemoryPoolAllocate; + table->amd_ext_->hsa_amd_memory_pool_free_fn = MemoryPoolFree; + table->amd_ext_->hsa_amd_agents_allow_access_fn = AgentsAllowAccess; + table->amd_ext_->hsa_amd_memory_async_copy_fn = MemoryAsyncCopy; + table->core_->hsa_executable_freeze_fn = ExecutableFreeze; + table->core_->hsa_executable_destroy_fn = ExecutableDestroy; + } + } + + static void SetCallbacks(rocprofiler_hsa_callbacks_t callbacks, void* arg) { + std::lock_guard lck(mutex_); + callbacks_ = callbacks; + arg_.store(arg); + } + + private: + static hsa_status_t HSA_API MemoryAllocate(hsa_region_t region, + size_t size, + void** ptr) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_memory_allocate_fn(region, size, ptr)); + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) { + rocprofiler_hsa_callback_data_t data{}; + data.allocate.ptr = *ptr; + data.allocate.size = size; + + HSA_RT(hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &data.allocate.segment)); + HSA_RT(hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &data.allocate.global_flag)); + + DO_HSA_CALLBACK; + } + return status; + } + + static hsa_status_t MemoryAssignAgent( + void *ptr, + hsa_agent_t agent, + hsa_access_permission_t access) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_memory_assign_agent_fn(ptr, agent, access)); + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) { + rocprofiler_hsa_callback_data_t data{}; + data.device.ptr = ptr; + + HSA_RT(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &data.device.type)); + + DO_HSA_CALLBACK; + } + return status; + } + + // Spawn device allow access callback + static void 
DeviceCallback( + uint32_t num_agents, + const hsa_agent_t* agents, + const void* ptr) + { + for (const hsa_agent_t* agent_p = agents; agent_p < (agents + num_agents); ++agent_p) { + hsa_agent_t agent = *agent_p; + rocprofiler_hsa_callback_data_t data{}; + data.device.id = util::HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index; + data.device.agent = agent; + data.device.ptr = ptr; + + HSA_RT(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &data.device.type)); + + ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE); + } + } + + // Agent allow access callback 'hsa_amd_agents_allow_access' + static hsa_status_t AgentsAllowAccess( + uint32_t num_agents, + const hsa_agent_t* agents, + const uint32_t* flags, + const void* ptr) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_amd_agents_allow_access_fn(num_agents, agents, flags, ptr)); + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) { + DeviceCallback(num_agents, agents, ptr); + } + return status; + } + + // Callback function to get available in the system agents + struct agent_callback_data_t { + hsa_amd_memory_pool_t pool; + void* ptr; + }; + static hsa_status_t AgentCallback(hsa_agent_t agent, void* data) { + agent_callback_data_t* callback_data = reinterpret_cast(data); + hsa_amd_agent_memory_pool_info_t attribute = HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS; + hsa_amd_memory_pool_access_t value; + HSA_RT(hsa_amd_agent_memory_pool_get_info(agent, callback_data->pool, attribute, &value)); + if (value == HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT) { + DeviceCallback(1, &agent, callback_data->ptr); + } + return HSA_STATUS_SUCCESS; + } + + static hsa_status_t MemoryPoolAllocate( + hsa_amd_memory_pool_t pool, + size_t size, + uint32_t flags, + void** ptr) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_amd_memory_pool_allocate_fn(pool, size, flags, ptr)); + if (size != 0) { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) { + rocprofiler_hsa_callback_data_t data{}; + data.allocate.ptr = *ptr; 
+ data.allocate.size = size; + + HSA_RT(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &data.allocate.segment)); + HSA_RT(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &data.allocate.global_flag)); + + DO_HSA_CALLBACK; + + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) { + // Scan the pool assigned devices + agent_callback_data_t callback_data{pool, *ptr}; + hsa_iterate_agents(AgentCallback, &callback_data); + } + } + } + return status; + } + static hsa_status_t MemoryPoolFree( + void* ptr) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) { + rocprofiler_hsa_callback_data_t data{}; + data.allocate.ptr = ptr; + data.allocate.size = 0; + DO_HSA_CALLBACK; + } + HSA_RT(hsa_amd_memory_pool_free_fn(ptr)); + return status; + } + + static hsa_status_t MemoryCopy( + void *dst, + const void *src, + size_t size) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_memory_copy_fn(dst, src, size)); + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_MEMCOPY) { + rocprofiler_hsa_callback_data_t data{}; + data.memcopy.dst = dst; + data.memcopy.src = src; + data.memcopy.size = size; + DO_HSA_CALLBACK; + } + return status; + } + + static hsa_status_t MemoryAsyncCopy( + void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, + uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_amd_memory_async_copy_fn( + dst, dst_agent, src, src_agent, size, + num_dep_signals, dep_signals, completion_signal)); + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_MEMCOPY) { + rocprofiler_hsa_callback_data_t data{}; + data.memcopy.dst = dst; + data.memcopy.src = src; + data.memcopy.size = size; + DO_HSA_CALLBACK; + } + return status; + } + + static hsa_status_t CodeObjectCallback( + hsa_executable_t executable, + hsa_loaded_code_object_t loaded_code_object, + void* arg) + { + const int 
free_flag = reinterpret_cast(arg);
+    rocprofiler_hsa_callback_data_t data{};
+
+    HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info(
+      loaded_code_object,
+      HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE,
+      &data.allocate.ptr));
+
+    if (free_flag == 0) {
+      HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info(
+        loaded_code_object,
+        HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE,
+        &data.allocate.size));
+    } else {
+      data.allocate.size = 0;
+    }
+
+    // Local GPU memory
+    // GLOBAL; FLAGS: COARSE GRAINED
+    data.allocate.segment = HSA_AMD_SEGMENT_GLOBAL;
+    data.allocate.global_flag = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED;
+    data.allocate.is_code = 1;
+
+    ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE);
+
+    if (free_flag == 0) {
+      IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) {
+        hsa_amd_pointer_info_t pointer_info{};
+        uint32_t num_agents = 0;
+        hsa_agent_t* agents = NULL;
+        pointer_info.size = sizeof(hsa_amd_pointer_info_t);
+        HSA_RT(hsa_amd_pointer_info(
+          const_cast(data.allocate.ptr),
+          &pointer_info,
+          malloc,
+          &num_agents,
+          &agents));
+        DeviceCallback(num_agents, agents, data.allocate.ptr);
+        free(agents);  // the array was allocated through the 'malloc' allocator passed above; release it once reported
+      }
+    }
+
+    return HSA_STATUS_SUCCESS;
+  }
+
+  static hsa_status_t ExecutableFreeze(
+    hsa_executable_t executable,
+    const char *options)
+  {
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+
+    HSA_RT(hsa_executable_freeze_fn(executable, options));
+
+    IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) {
+      LoaderApiTable.hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
+        executable,
+        CodeObjectCallback,
+        reinterpret_cast(0));
+    }
+
+    return status;
+  }
+
+  static hsa_status_t ExecutableDestroy(
+    hsa_executable_t executable)
+  {
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+
+    IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) {
+      LoaderApiTable.hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
+        executable,
+        CodeObjectCallback,
+        reinterpret_cast(1));
+    }
+
+
HSA_RT(hsa_executable_destroy_fn(executable)); + + return status; + } + + static bool enable_; + static thread_local bool recursion_; + static hsa_ven_amd_loader_1_01_pfn_t LoaderApiTable; + static rocprofiler_hsa_callbacks_t callbacks_; + static arg_t arg_; + static mutex_t mutex_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_HSA_INTERCEPTOR_H diff --git a/src/core/hsa_proxy_queue.h b/src/core/hsa_proxy_queue.h index dd4999b9..3713bfac 100644 --- a/src/core/hsa_proxy_queue.h +++ b/src/core/hsa_proxy_queue.h @@ -30,6 +30,7 @@ THE SOFTWARE. #include "core/proxy_queue.h" #include "util/exception.h" +#include "util/hsa_rsrc_factory.h" namespace rocprofiler { extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp index 91028f73..0b309d63 100644 --- a/src/core/intercept_queue.cpp +++ b/src/core/intercept_queue.cpp @@ -29,10 +29,9 @@ void InterceptQueue::HsaIntercept(HsaApiTable* table) { } InterceptQueue::mutex_t InterceptQueue::mutex_; -rocprofiler_callback_t InterceptQueue::dispatch_callback_ = NULL; -InterceptQueue::queue_callback_t InterceptQueue::create_callback_ = NULL; -InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL; +rocprofiler_queue_callbacks_t InterceptQueue::callbacks_ = {}; void* InterceptQueue::callback_data_ = NULL; +std::atomic InterceptQueue::dispatch_callback_{NULL}; InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; const char* InterceptQueue::kernel_none_ = ""; Tracker* InterceptQueue::tracker_ = NULL; @@ -40,4 +39,7 @@ bool InterceptQueue::tracker_on_ = false; bool InterceptQueue::in_create_call_ = false; InterceptQueue::queue_id_t InterceptQueue::current_queue_id = 0; +rocprofiler_hsa_callback_fun_t InterceptQueue::submit_callback_fun_ = NULL; +void* InterceptQueue::submit_callback_arg_ = NULL; + } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index f639b3e5..a52d8c1d 100644 --- 
a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -84,8 +84,8 @@ class InterceptQueue { obj->queue_id = current_queue_id; ++current_queue_id; - if (create_callback_ != NULL) { - status = create_callback_(*queue, callback_data_); + if (callbacks_.create != NULL) { + status = callbacks_.create(*queue, callback_data_); } in_create_call_ = false; @@ -112,8 +112,8 @@ class InterceptQueue { std::lock_guard lck(mutex_); hsa_status_t status = HSA_STATUS_SUCCESS; - if (destroy_callback_ != NULL) { - status = destroy_callback_(queue, callback_data_); + if (callbacks_.destroy != NULL) { + status = callbacks_.destroy(queue, callback_data_); } if (status == HSA_STATUS_SUCCESS) { @@ -129,13 +129,47 @@ class InterceptQueue { InterceptQueue* obj = reinterpret_cast(data); Queue* proxy = obj->proxy_; + if (submit_callback_fun_) { + mutex_.lock(); + auto* callback_fun = submit_callback_fun_; + void* callback_arg = submit_callback_arg_; + mutex_.unlock(); + + if (callback_fun) { + for (uint64_t j = 0; j < count; ++j) { + const packet_t* packet = &packets_arr[j]; + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + + const char* kernel_name = NULL; + if (GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) { + uint64_t kernel_object = dispatch_packet->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + kernel_name = (GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) ? 
+              QueryKernelName(kernel_object, kernel_code) : NULL;
+          }
+
+          // Preparing submit callback data (kernel_name stays NULL for non-dispatch packets)
+          rocprofiler_hsa_callback_data_t data{};
+          data.submit.packet = (void*)packet;
+          data.submit.kernel_name = kernel_name;
+          data.submit.queue = obj->queue_;
+          data.submit.device_type = obj->agent_info_->dev_type;
+          data.submit.device_id = obj->agent_info_->dev_index;
+          callback_fun(ROCPROFILER_HSA_CB_ID_SUBMIT, &data, callback_arg);
+          free(const_cast<char*>(kernel_name));  // QueryKernelName() returns a heap string; free(NULL) is a no-op
+        }
+      }
+    }
+
     // Travers input packets
     for (uint64_t j = 0; j < count; ++j) {
       const packet_t* packet = &packets_arr[j];
       bool to_submit = true;

       // Checking for dispatch packet type
-      if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && (dispatch_callback_ != NULL)) {
+      if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) &&
+          (dispatch_callback_.load(std::memory_order_acquire) != NULL)) {
         const hsa_kernel_dispatch_packet_t* dispatch_packet =
             reinterpret_cast(packet);
         const hsa_signal_t completion_signal = dispatch_packet->completion_signal;
@@ -150,17 +184,7 @@ class InterceptQueue {
         // Prepareing dispatch callback data
         uint64_t kernel_object = dispatch_packet->kernel_object;
         const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object);
-
-        const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1);
-        if (kernel_object_flag == 0) {
-          if (!util::HsaRsrcFactory::IsExecutableTracking()) {
-            fprintf(stderr, "Error: V3 code object detected - code objects tracking should be enabled\n");
-            abort();
-          }
-        }
-        const char* kernel_name = (util::HsaRsrcFactory::IsExecutableTracking()) ?
- util::HsaRsrcFactory::GetKernelName(kernel_object) : - GetKernelName(kernel_code->runtime_loader_kernel_symbol); + const char* kernel_name = QueryKernelName(kernel_object, kernel_code); rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, @@ -172,12 +196,12 @@ class InterceptQueue { kernel_name, kernel_object, kernel_code, - syscall(__NR_gettid), + (uint32_t)syscall(__NR_gettid), (tracker_entry) ? tracker_entry->record : NULL}; // Calling dispatch callback rocprofiler_group_t group = {}; - hsa_status_t status = dispatch_callback_(&data, callback_data_, &group); + hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); free(const_cast(kernel_name)); // Injecting profiling start/stop packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { @@ -227,16 +251,29 @@ class InterceptQueue { } } - static void SetCallbacks(rocprofiler_callback_t dispatch_callback, - queue_callback_t create_callback, - queue_callback_t destroy_callback, - void* data) - { + static void SetCallbacks(rocprofiler_queue_callbacks_t callbacks, void* data) { std::lock_guard lck(mutex_); + if (callback_data_ != NULL) { + EXC_ABORT(HSA_STATUS_ERROR, "reassigning queue callbacks - not supported"); + } + callbacks_ = callbacks; callback_data_ = data; - dispatch_callback_ = dispatch_callback; - create_callback_ = create_callback; - destroy_callback_ = destroy_callback; + Start(); + } + + static void RemoveCallbacks() { + std::lock_guard lck(mutex_); + callbacks_ = {}; + Stop(); + } + + static inline void Start() { dispatch_callback_.store(callbacks_.dispatch, std::memory_order_release); } + static inline void Stop() { dispatch_callback_.store(NULL, std::memory_order_relaxed); } + + static void SetSubmitCallback(rocprofiler_hsa_callback_fun_t fun, void* arg) { + std::lock_guard lck(mutex_); + submit_callback_fun_ = fun; + submit_callback_arg_ = arg; } static void TrackerOn(bool on) { tracker_on_ = on; } @@ -269,20 +306,28 @@ 
class InterceptQueue {
   static const char* GetKernelName(const uint64_t kernel_symbol) {
     amd_runtime_loader_debug_info_t* dbg_info =
         reinterpret_cast(kernel_symbol);
-    const char* kernel_name = (dbg_info != NULL) ? dbg_info->kernel_name : NULL;
-
-    // Kernel name is mangled name
-    // apply __cxa_demangle() to demangle it
-    const char* funcname = NULL;
-    if (kernel_name != NULL) {
-      size_t funcnamesize = 0;
-      int status;
-      const char* ret = abi::__cxa_demangle(kernel_name, NULL, &funcnamesize, &status);
-      funcname = (ret != 0) ? ret : strdup(kernel_name);
-    }
-    if (funcname == NULL) funcname = strdup(kernel_none_);
+    return (dbg_info != NULL) ? dbg_info->kernel_name : NULL;
+  }

-    return funcname;
+  // Demangle C++ symbol name; always returns a heap string the caller must free() (a copy of kernel_none_ for a NULL symbol)
+  static const char* cpp_demangle(const char* symname) {
+    if (symname == NULL) return strdup(kernel_none_);  // GetKernelName() yields NULL when there is no debug info
+    int status = 0;
+    const char* ret = abi::__cxa_demangle(symname, NULL, NULL, &status);  // NULL buffer: result is malloc'ed
+    return (ret != 0) ? ret : strdup(symname);  // not a mangled name: hand back a copy of the raw symbol
+  }
+
+  static const char* QueryKernelName(uint64_t kernel_object, const amd_kernel_code_t* kernel_code) {
+    const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1);
+    if (kernel_object_flag == 0) {
+      if (!util::HsaRsrcFactory::IsExecutableTracking()) {
+        EXC_ABORT(HSA_STATUS_ERROR, "Error: V3 code object detected - code objects tracking should be enabled\n");
+      }
+    }
+    const char* kernel_symname = (util::HsaRsrcFactory::IsExecutableTracking()) ?
+ util::HsaRsrcFactory::GetKernelNameRef(kernel_object) : + GetKernelName(kernel_code->runtime_loader_kernel_symbol); + return cpp_demangle(kernel_symname); } // method to get an intercept queue object @@ -324,12 +369,13 @@ class InterceptQueue { ProxyQueue::Destroy(proxy_); } - static mutex_t mutex_; static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; - static rocprofiler_callback_t dispatch_callback_; - static queue_callback_t create_callback_; - static queue_callback_t destroy_callback_; + + static mutex_t mutex_; + static rocprofiler_queue_callbacks_t callbacks_; static void* callback_data_; + static std::atomic dispatch_callback_; + static obj_map_t* obj_map_; static const char* kernel_none_; static Tracker* tracker_; @@ -337,6 +383,9 @@ class InterceptQueue { static bool in_create_call_; static queue_id_t current_queue_id; + static rocprofiler_hsa_callback_fun_t submit_callback_fun_; + static void* submit_callback_arg_; + hsa_queue_t* const queue_; ProxyQueue* const proxy_; const util::AgentInfo* agent_info_; diff --git a/src/core/metrics.h b/src/core/metrics.h index f9ae1fbd..a221168a 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -277,7 +277,7 @@ class MetricsDict { std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; #endif counters_vec_t counters_vec; - for (const auto& var : expr_obj->GetVars()) { + for (const std::string& var : expr_obj->GetVars()) { auto it = cache_.find(var); if (it == cache_.end()) { EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var << "' is not found"); diff --git a/src/core/proxy_queue.h b/src/core/proxy_queue.h index 42e6c63b..e719fed4 100644 --- a/src/core/proxy_queue.h +++ b/src/core/proxy_queue.h @@ -24,7 +24,7 @@ THE SOFTWARE. 
#define _SRC_CORE_PROXY_QUEUE_H #include -#include +#include #include #include #include diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 3f1362a7..618edf23 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -23,13 +23,15 @@ THE SOFTWARE. #include "inc/rocprofiler.h" #include -#include #include + +#include #include #include "core/context.h" #include "core/context_pool.h" #include "core/hsa_queue.h" +#include "core/hsa_interceptor.h" #include "core/intercept_queue.h" #include "core/proxy_queue.h" #include "core/simple_proxy_queue.h" @@ -53,6 +55,15 @@ THE SOFTWARE. } \ return status; +#define ONLOAD_TRACE(str) \ + if (getenv("ROCP_ONLOAD_TRACE")) do { \ + std::cout << "PID(" << GetPid() << "): PROF_LIB::" << __FUNCTION__ << " " << str << std::endl << std::flush; \ + } while(0); +#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") +#define ONLOAD_TRACE_END() ONLOAD_TRACE("end") + +static inline uint32_t GetPid() { return syscall(__NR_getpid); } + /////////////////////////////////////////////////////////////////////////////////////////////////// // Internal library methods // @@ -84,8 +95,16 @@ decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacqui decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; +decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; +decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; +decltype(hsa_memory_copy)* hsa_memory_copy_fn; +decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn; +decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn; +decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn; decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; +decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; 
+decltype(hsa_executable_destroy)* hsa_executable_destroy_fn; ::HsaApiTable* kHsaApiTable; @@ -146,10 +165,14 @@ enum { DISPATCH_INTERCEPT_MODE = 0x1, CODE_OBJ_TRACKING_MODE = 0x2, MEMCOPY_INTERCEPT_MODE = 0x4, + HSA_INTERCEPT_MODE = 0x8, }; uint32_t LoadTool() { uint32_t intercept_mode = 0; const char* tool_lib = getenv("ROCP_TOOL_LIB"); + std::ostringstream oss; + if (tool_lib) oss << "load tool library(" << tool_lib << ")"; + ONLOAD_TRACE(oss.str()); if (tool_lib) { intercept_mode = DISPATCH_INTERCEPT_MODE; @@ -180,6 +203,7 @@ uint32_t LoadTool() { settings.trace_local = TraceProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 1 : 0; + settings.code_obj_tracking = 1; if (handler) handler(); else if (handler_prop) handler_prop(&settings); @@ -191,13 +215,16 @@ uint32_t LoadTool() { if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE; if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE; + if (settings.hsa_intercepting) intercept_mode |= HSA_INTERCEPT_MODE; } + ONLOAD_TRACE("end intercept_mode(" << intercept_mode << ")"); return intercept_mode; } // Unload profiling tool librray void UnloadTool() { + ONLOAD_TRACE("tool handle(" << tool_handle << ")"); if (tool_handle) { tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); if (handler == NULL) { @@ -208,16 +235,21 @@ void UnloadTool() { handler(); dlclose(tool_handle); } + ONLOAD_TRACE_END(); } CONSTRUCTOR_API void constructor() { + ONLOAD_TRACE_BEG(); util::Logger::Create(); + ONLOAD_TRACE_END(); } DESTRUCTOR_API void destructor() { + ONLOAD_TRACE_BEG(); rocprofiler::MetricsDict::Destroy(); util::HsaRsrcFactory::Destroy(); util::Logger::Destroy(); + ONLOAD_TRACE_END(); } const MetricsDict* GetMetrics(const hsa_agent_t& agent) { @@ -403,6 +435,7 @@ extern "C" { // HSA-runtime tool on-load 
method PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) { + ONLOAD_TRACE_BEG(); rocprofiler::SaveHsaApi(table); rocprofiler::ProxyQueue::InitFactory(); bool intercept_mode = false; @@ -449,6 +482,13 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa table->amd_ext_->hsa_amd_memory_async_copy_fn = rocprofiler::hsa_amd_memory_async_copy_interceptor; table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = rocprofiler::hsa_amd_memory_async_copy_rect_interceptor; } + if (intercept_mode_mask & rocprofiler::HSA_INTERCEPT_MODE) { + if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) { + EXC_ABORT(HSA_STATUS_ERROR, "HSA_INTERCEPT and MEMCOPY_INTERCEPT conflict"); + } + rocprofiler::HsaInterceptor::Enable(true); + rocprofiler::HsaInterceptor::HsaIntercept(table); + } // HSA intercepting if (intercept_mode) { @@ -458,14 +498,16 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa rocprofiler::StandaloneIntercept(); } + ONLOAD_TRACE_END(); return true; } // HSA-runtime tool on-unload method PUBLIC_API void OnUnload() { - rocprofiler::Tracker::Destroy(); + ONLOAD_TRACE_BEG(); rocprofiler::UnloadTool(); rocprofiler::RestoreHsaApi(); + ONLOAD_TRACE_END(); } // Returns library vesrion @@ -527,6 +569,14 @@ PUBLIC_API hsa_status_t rocprofiler_reset(rocprofiler_t* handle, uint32_t group_ API_METHOD_SUFFIX } +// Return context agent +PUBLIC_API hsa_status_t rocprofiler_get_agent(rocprofiler_t* handle, hsa_agent_t* agent) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + *agent = context->GetAgent(); + API_METHOD_SUFFIX +} + // Get profiling group count PUBLIC_API hsa_status_t rocprofiler_group_count(const rocprofiler_t* handle, uint32_t* group_count) { @@ -617,14 +667,26 @@ PUBLIC_API hsa_status_t rocprofiler_get_metrics(const rocprofiler_t* handle) { // Set/remove queue callbacks PUBLIC_API 
hsa_status_t rocprofiler_set_queue_callbacks(rocprofiler_queue_callbacks_t callbacks, void* data) { API_METHOD_PREFIX - rocprofiler::InterceptQueue::SetCallbacks(callbacks.dispatch, callbacks.create, callbacks.destroy, data); + rocprofiler::InterceptQueue::SetCallbacks(callbacks, data); API_METHOD_SUFFIX } // Remove queue callbacks PUBLIC_API hsa_status_t rocprofiler_remove_queue_callbacks() { API_METHOD_PREFIX - rocprofiler::InterceptQueue::SetCallbacks(NULL, NULL, NULL, NULL); + rocprofiler::InterceptQueue::RemoveCallbacks(); + API_METHOD_SUFFIX +} + +// Start/stop queue callbacks +PUBLIC_API hsa_status_t rocprofiler_start_queue_callbacks() { + API_METHOD_PREFIX + rocprofiler::InterceptQueue::Start(); + API_METHOD_SUFFIX +} +PUBLIC_API hsa_status_t rocprofiler_stop_queue_callbacks() { + API_METHOD_PREFIX + rocprofiler::InterceptQueue::Stop(); API_METHOD_SUFFIX } @@ -785,7 +847,7 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_info( uint32_t block_counters; profile.events = &(counters_vec[0]->event); status = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( - &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); if (status != HSA_STATUS_SUCCESS) continue; info.metric.instances = query.instance_count; @@ -840,16 +902,47 @@ PUBLIC_API hsa_status_t rocprofiler_queue_create_profiled( void* data, uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue) { - return rocprofiler::InterceptQueue::QueueCreateTracked(agent, size, type, callback, data, private_segment_size, group_segment_size, queue); + API_METHOD_PREFIX + status = rocprofiler::InterceptQueue::QueueCreateTracked( + agent, size, type, callback, data, private_segment_size, group_segment_size, queue); + API_METHOD_SUFFIX } // Return time value for a given time ID and profiling timestamp -hsa_status_t rocprofiler_get_time( +PUBLIC_API hsa_status_t 
rocprofiler_get_time( rocprofiler_time_id_t time_id, uint64_t timestamp, - uint64_t* value_ns) + uint64_t* value_ns, + uint64_t* error_ns) { - return rocprofiler::util::HsaRsrcFactory::Instance().GetTime(time_id, timestamp, value_ns); + API_METHOD_PREFIX + if (error_ns != NULL) { + *error_ns = 0; + status = rocprofiler::util::HsaRsrcFactory::Instance().GetTimeErr(time_id, error_ns); + } + if ((status == HSA_STATUS_SUCCESS) && (value_ns != NULL)) { + *value_ns = 0; + status = rocprofiler::util::HsaRsrcFactory::Instance().GetTimeVal(time_id, timestamp, value_ns); + } + API_METHOD_SUFFIX } } // extern "C" + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// HSA API callbacks routines +// +bool rocprofiler::HsaInterceptor::enable_ = false; +thread_local bool rocprofiler::HsaInterceptor::recursion_ = false;; +rocprofiler_hsa_callbacks_t rocprofiler::HsaInterceptor::callbacks_{}; +rocprofiler::HsaInterceptor::arg_t rocprofiler::HsaInterceptor::arg_{}; +hsa_ven_amd_loader_1_01_pfn_t rocprofiler::HsaInterceptor::LoaderApiTable{}; +rocprofiler::HsaInterceptor::mutex_t rocprofiler::HsaInterceptor::mutex_; + +// Set HSA callbacks. 
If a callback is NULL then it is disabled +extern "C" PUBLIC_API hsa_status_t rocprofiler_set_hsa_callbacks(const rocprofiler_hsa_callbacks_t callbacks, void* arg) { + API_METHOD_PREFIX + rocprofiler::HsaInterceptor::SetCallbacks(callbacks, arg); + rocprofiler::InterceptQueue::SetSubmitCallback(callbacks.submit, arg); + API_METHOD_SUFFIX +} diff --git a/src/core/tracker.h b/src/core/tracker.h index e366c761..823dc17d 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -138,7 +138,7 @@ class Tracker { // Debug trace if (trace_on_) { auto outstanding = outstanding_.fetch_add(1); - fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); + fprintf(stdout, "Tracker::Enable: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); fflush(stdout); } } @@ -163,11 +163,21 @@ class Tracker { {} ~Tracker() { + if (trace_on_) { + fprintf(stdout, "Tracker::DESTR: sig list %d, outst %lu\n", (int)(sig_list_.size()), outstanding_.load()); + fflush(stdout); + } + auto it = sig_list_.begin(); auto end = sig_list_.end(); while (it != end) { auto cur = it++; - hsa_rsrc_->SignalWait((*cur)->signal); +// The wait should be optiona as there possible some inter kernel dependencies and it possible to wait for +// the kernels will never be lunched as the application was finished by some reason. 
+#if 0 + // FIXME: currently the signal value for tracking signals are taken from original application signal + hsa_rsrc_->SignalWait((*cur)->signal, 1); +#endif Erase(cur); } } @@ -182,7 +192,7 @@ class Tracker { // Debug trace if (trace_on_) { auto outstanding = outstanding_.fetch_sub(1); - fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); + fprintf(stdout, "Tracker::Complete: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); fflush(stdout); } diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 9ce362d4..78833284 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -44,9 +44,6 @@ POSSIBILITY OF SUCH DAMAGE. #include #include -#include "util/exception.h" -#include "util/logger.h" - namespace rocprofiler { namespace util { @@ -152,11 +149,15 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize // Time correlation const uint32_t corr_iters = 1000; - CorrelateTime(HsaTimer::TIME_ID_CLOCK_REALTIME, corr_iters); - CorrelateTime(HsaTimer::TIME_ID_CLOCK_MONOTONIC, corr_iters); + for (unsigned time_id = 0; time_id < HsaTimer::TIME_ID_NUMBER; time_id += 1) { + CorrelateTime((HsaTimer::time_id_t)time_id, corr_iters); + } // System timeout timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? 
timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); + + // To dump code objects + to_dump_code_obj_ = getenv("ROCP_DUMP_CODEOBJ"); } // Destructor of the class @@ -197,6 +198,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_destroy = table->core_->hsa_executable_destroy_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; @@ -237,6 +239,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_destroy = hsa_executable_destroy; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; @@ -523,22 +526,25 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s } // Wait signal -void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { +hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + const hsa_signal_value_t exp_value = signal_value - 1; + hsa_signal_value_t ret_value = signal_value; while (1) { - const hsa_signal_value_t signal_value = - hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); - if (signal_value == 0) 
{ - break; - } else { - if (signal_value == 1) WARN_LOGGING("signal waiting..."); - else EXC_RAISING(HSA_STATUS_ERROR, "hsa_signal_wait_scacquire (" << signal_value << ")"); + ret_value = + hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED); + if (ret_value == exp_value) break; + if (ret_value != signal_value) { + std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value + << "), ret_value(" << ret_value << ")" << std::endl << std::flush; + abort(); } } + return ret_value; } // Wait signal with signal value restore void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { - SignalWait(signal); + SignalWait(signal, signal_value); hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } @@ -551,7 +557,7 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src CHECK_STATUS("hsa_signal_create()", status); status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); - SignalWait(s); + SignalWait(s, 1); status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } @@ -695,20 +701,21 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -const char* HsaRsrcFactory::GetKernelName(uint64_t addr) { +const char* HsaRsrcFactory::GetKernelNameRef(uint64_t addr) { std::lock_guard lck(mutex_); const auto it = symbols_map_->find(addr); if (it == symbols_map_->end()) { fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); abort(); } - return strdup(it->second); + return it->second; } void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { std::lock_guard lck(mutex_); executable_tracking_on_ = true; table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; + 
table->core_->hsa_executable_destroy_fn = hsa_executable_destroy_interceptor; } hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) { @@ -726,10 +733,14 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_ex status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); CHECK_STATUS("Error in getting kernel name", status); name[len] = 0; - auto ret = symbols_map_->insert({addr, name}); - if (ret.second == false) { - delete[] ret.first->second; - ret.first->second = name; + if (data == NULL) { + auto ret = symbols_map_->insert({addr, name}); + if (ret.second == false) { + delete[] ret.first->second; + ret.first->second = name; + } + } else { + symbols_map_->erase(addr); } } return HSA_STATUS_SUCCESS; @@ -740,7 +751,16 @@ hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); CHECK_STATUS("Error in iterating executable symbols", status); - return hsa_api_.hsa_executable_freeze(executable, options);; + return hsa_api_.hsa_executable_freeze(executable, options); +} + +hsa_status_t HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable) { + std::lock_guard lck(mutex_); + if (symbols_map_ != NULL) { + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)1); + CHECK_STATUS("Error in iterating executable symbols", status); + } + return hsa_api_.hsa_executable_destroy(executable); } std::atomic HsaRsrcFactory::instance_{}; @@ -749,6 +769,7 @@ HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MA hsa_pfn_t HsaRsrcFactory::hsa_api_{}; bool HsaRsrcFactory::executable_tracking_on_ = false; HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; +void* 
HsaRsrcFactory::to_dump_code_obj_ = NULL; } // namespace util } // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index 0362bc2c..a8e392aa 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -25,6 +25,8 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef SRC_UTIL_HSA_RSRC_FACTORY_H_ #define SRC_UTIL_HSA_RSRC_FACTORY_H_ +#define AMD_INTERNAL_BUILD + #include #include #include @@ -95,6 +97,7 @@ struct hsa_pfn_t { decltype(hsa_executable_create_alt)* hsa_executable_create_alt; decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_destroy)* hsa_executable_destroy; decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; @@ -164,10 +167,11 @@ struct AgentInfo { // Number of Shader Arrays Per Shader Engines in Gpu uint32_t shader_arrays_per_se; - // SGPR/VGPR block sizes + // SGPR/VGPR/LDS block sizes uint32_t sgpr_block_dflt; uint32_t sgpr_block_size; uint32_t vgpr_block_size; + static const uint32_t lds_block_size = 128 * 4; }; // HSA timer class @@ -180,7 +184,10 @@ class HsaTimer { enum time_id_t { TIME_ID_CLOCK_REALTIME = 0, - TIME_ID_CLOCK_MONOTONIC = 1, + TIME_ID_CLOCK_REALTIME_COARSE = 1, + TIME_ID_CLOCK_MONOTONIC = 2, + TIME_ID_CLOCK_MONOTONIC_COARSE = 3, + TIME_ID_CLOCK_MONOTONIC_RAW = 4, TIME_ID_NUMBER }; @@ -200,7 +207,7 @@ class HsaTimer { } // Method for timespec/ns conversion - timestamp_t timespec_to_ns(const timespec& time) const { + static timestamp_t timespec_to_ns(const timespec& time) { return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; } @@ -224,13 +231,22 @@ class HsaTimer { void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) { clockid_t 
clock_id = 0; - switch (clock_id) { + switch (time_id) { case TIME_ID_CLOCK_REALTIME: clock_id = CLOCK_REALTIME; break; + case TIME_ID_CLOCK_REALTIME_COARSE: + clock_id = CLOCK_REALTIME_COARSE; + break; case TIME_ID_CLOCK_MONOTONIC: clock_id = CLOCK_MONOTONIC; break; + case TIME_ID_CLOCK_MONOTONIC_COARSE: + clock_id = CLOCK_MONOTONIC_COARSE; + break; + case TIME_ID_CLOCK_MONOTONIC_RAW: + clock_id = CLOCK_MONOTONIC_RAW; + break; default: CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR); } @@ -361,7 +377,7 @@ class HsaRsrcFactory { uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); // Wait signal - void SignalWait(const hsa_signal_t& signal) const; + hsa_signal_value_t SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; // Wait signal with signal value restore void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; @@ -393,7 +409,7 @@ class HsaRsrcFactory { // Enable executables loading tracking static bool IsExecutableTracking() { return executable_tracking_on_; } static void EnableExecutableTracking(HsaApiTable* table); - static const char* GetKernelName(uint64_t addr); + static const char* GetKernelNameRef(uint64_t addr); // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); @@ -428,9 +444,14 @@ class HsaRsrcFactory { time_error_[time_id] = error_v; } - hsa_status_t GetTime(uint32_t time_id, uint64_t value, uint64_t* time) { + hsa_status_t GetTimeVal(uint32_t time_id, uint64_t time_stamp, uint64_t* time_value) { if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; - *time = value + time_shift_[time_id]; + *time_value = time_stamp + time_shift_[time_id]; + return HSA_STATUS_SUCCESS; + } + + hsa_status_t GetTimeErr(uint32_t time_id, uint64_t* err) { + *err = time_error_[time_id]; return HSA_STATUS_SUCCESS; } @@ -478,7 +499,9 @@ class HsaRsrcFactory { typedef std::map symbols_map_t; static symbols_map_t* symbols_map_; 
static bool executable_tracking_on_; + static void* to_dump_code_obj_; static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t hsa_executable_destroy_interceptor(hsa_executable_t executable); static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); // HSA runtime API table diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1ae8a554..4b3aec02 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -40,15 +40,22 @@ target_include_directories ( "c_test" PRIVATE ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" ) ## Standalone test sources -set ( STEXE_NAME "standalone_test" ) -set ( STTST_SRC +set ( ST_EXE_NAME "standalone_test" ) +set ( ST_TST_SRC ${TEST_DIR}/app/standalone_test.cpp ${TEST_DIR}/ctrl/test_hsa.cpp ) +## Standalone intercept test sources +set ( STIN_EXE_NAME "stand_intercept_test" ) +set ( STIN_TST_SRC + ${TEST_DIR}/app/stand_intercept_test.cpp + ${TEST_DIR}/ctrl/test_hsa.cpp +) + ## Intercept test sources -set ( INEXE_NAME "intercept_test" ) -set ( INTST_SRC +set ( IN_EXE_NAME "intercept_test" ) +set ( IN_TST_SRC ${TEST_DIR}/app/intercept_test.cpp ${TEST_DIR}/ctrl/test_hsa.cpp ) @@ -61,26 +68,34 @@ set ( CTRL_SRC ## Dummy kernel set ( DUMMY_NAME dummy_kernel ) -execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME} ${PROJECT_BINARY_DIR}" ) - ## Test kernel set ( TEST_NAME simple_convolution ) set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp ) -execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${TEST_NAME}/${TEST_NAME} ${PROJECT_BINARY_DIR}" ) + +## Building test kernels +add_custom_target( mytest + COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME} ${PROJECT_BINARY_DIR} '${ROCM_ROOT_DIR}' '${GPU_TARGETS}'" + COMMAND sh -xc 
"${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${TEST_NAME}/${TEST_NAME} ${PROJECT_BINARY_DIR} '${ROCM_ROOT_DIR}' '${GPU_TARGETS}'" +) ## Building standalone test executable -add_executable ( ${STEXE_NAME} ${STTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) -target_include_directories ( ${STEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries ( ${STEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +add_executable ( ${ST_EXE_NAME} ${ST_TST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${ST_EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries ( ${ST_EXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) + +## Building standalone intercept test executable +add_executable ( ${STIN_EXE_NAME} ${STIN_TST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${STIN_EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries ( ${STIN_EXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) ## Building intercept test executable -add_library ( ${INEXE_NAME} SHARED ${INTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) -target_include_directories ( ${INEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries ( ${INEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +add_library ( ${IN_EXE_NAME} SHARED ${IN_TST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${IN_EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries ( ${IN_EXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) ## Building ctrl test executable add_executable ( ${EXE_NAME} ${CTRL_SRC} ${UTIL_SRC} ${KERN_SRC} ) -target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_include_directories ( ${EXE_NAME} 
PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_LIB_PATH}/../include ) target_link_libraries ( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/tool/*.xml ${PROJECT_BINARY_DIR}" ) @@ -93,5 +108,11 @@ add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) +## Build memory test bench +add_custom_target( mbench + COMMAND sh -xc "cp -r ${TEST_DIR}/memory_validation ${PROJECT_BINARY_DIR}/test/." + COMMAND make -C "${PROJECT_BINARY_DIR}/test/memory_validation" +) + ## Copy OCL test -execute_process ( COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test" ) +execute_process ( COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test/." ) diff --git a/test/app/intercept_test.cpp b/test/app/intercept_test.cpp index c2905d1e..e62bf6ce 100644 --- a/test/app/intercept_test.cpp +++ b/test/app/intercept_test.cpp @@ -91,7 +91,7 @@ void dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features, const rocprofiler_dispatch_record_t* record = entry->data.record; fflush(stdout); - fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\") tid(%ld) queue-id(%u) gpu-id(%u) ", + fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\") tid(%u) queue-id(%u) gpu-id(%u) ", entry->data.kernel_object, kernel_name.c_str(), entry->data.thread_id, diff --git a/test/app/stand_intercept_test.cpp b/test/app/stand_intercept_test.cpp new file mode 100644 index 00000000..97642557 --- /dev/null +++ b/test/app/stand_intercept_test.cpp @@ -0,0 +1,190 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "ctrl/test_hsa.h" +#include "inc/rocprofiler.h" +#include "dummy_kernel/dummy_kernel.h" +#include "simple_convolution/simple_convolution.h" +#include "util/test_assert.h" + +// Dispatch callbacks and context handlers synchronization +pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +// Error handler +void fatal(const std::string msg) { + fflush(stdout); + fprintf(stderr, "%s\n\n", msg.c_str()); + fflush(stderr); + abort(); +} + +// Check returned HSA API status +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// Context stored entry type +struct context_entry_t { + bool valid; + hsa_agent_t agent; + rocprofiler_group_t group; + rocprofiler_callback_data_t data; +}; + +// Dump stored context entry +void dump_context_entry(context_entry_t* entry) { + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); + + const std::string kernel_name = entry->data.kernel_name; + const rocprofiler_dispatch_record_t* record = entry->data.record; + + fflush(stdout); + fprintf(stdout, "kernel-object(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str()); + if (record) fprintf(stdout, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)", + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + record->dispatch, + record->begin, + record->end, + record->complete); + fprintf(stdout, "\n"); + fflush(stdout); + + rocprofiler_group_t& group = entry->group; + if (group.context == NULL) { + fprintf(stderr, "tool error: context is NULL\n"); + abort(); + } + + rocprofiler_close(group.context); +} + +// Profiling completion 
handler +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(entry); + delete entry; + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* /*user_data*/, + rocprofiler_group_t* group) { + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + // Profiling context + rocprofiler_t* context = NULL; + + // Context entry + context_entry_t* entry = new context_entry_t(); + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = (void*)entry; + + // Open profiling context + status = rocprofiler_open(callback_data->agent, NULL, 0, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + // Fill profiling context entry + entry->agent = callback_data->agent; + entry->group = *group; + entry->data = *callback_data; + entry->data.kernel_name = strdup(callback_data->kernel_name); + reinterpret_cast*>(&entry->valid)->store(true); + + return HSA_STATUS_SUCCESS; +} + +int main() { + bool ret_val = true; + const char* kiter_s = getenv("ROCP_KITER"); + const char* diter_s = getenv("ROCP_DITER"); + const unsigned kiter = (kiter_s != NULL) ? atol(kiter_s) : 1; + const unsigned diter = (diter_s != NULL) ? 
atol(diter_s) : 1; + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{}; + callbacks_ptrs.dispatch = dispatch_callback; + rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL); + + // Instantiate HSA resources + HsaRsrcFactory::Create(); + + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) abort(); + + // Creating the queue + hsa_queue_t* queue = NULL; + if (HsaRsrcFactory::Instance().CreateQueue(agent_info, 128, &queue) == false) abort(); + + // Test initialization + TestHsa::HsaInstantiate(); + + for (unsigned ind = 0; ind < kiter; ++ind) { + printf("Iteration %u:\n", ind); + if ((ind & 1) == 0) rocprofiler_start_queue_callbacks(); + else rocprofiler_stop_queue_callbacks(); + ret_val = RunKernel(0, NULL, agent_info, queue, diter); + if (ret_val) ret_val = RunKernel(0, NULL, agent_info, queue, diter); + } + + TestHsa::HsaShutdown(); + + return (ret_val) ? 0 : 1; +} diff --git a/test/app/standalone_test.cpp b/test/app/standalone_test.cpp index b173c4d3..34bc05ea 100644 --- a/test/app/standalone_test.cpp +++ b/test/app/standalone_test.cpp @@ -31,8 +31,46 @@ THE SOFTWARE. 
#include "inc/rocprofiler.h" #include "dummy_kernel/dummy_kernel.h" #include "simple_convolution/simple_convolution.h" +#include "util/hsa_rsrc_factory.h" #include "util/test_assert.h" +// print time +void print_sys_time(clockid_t clock_id, rocprofiler_time_id_t time_id) { + HsaTimer::timestamp_t value_ns = 0; + HsaTimer::timestamp_t error_ns = 0; + HsaTimer::timestamp_t timestamp = 0; + + timespec tm_val; + clock_gettime(clock_id, &tm_val); + HsaTimer::timestamp_t tm_val_ns = HsaTimer::timespec_to_ns(tm_val); + + timestamp = HsaRsrcFactory::Instance().TimestampNs(); + hsa_status_t status = rocprofiler_get_time(time_id, timestamp, &value_ns, &error_ns); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + HsaTimer::timestamp_t timestamp1 = timestamp; + HsaTimer::timestamp_t value_ns1 = value_ns; + + printf("time-id(%d) ts_ns(%lu) orig_ns(%lu) time_ns(%lu) err_ns(%lu)\n", (int)time_id, timestamp, tm_val_ns, value_ns, error_ns); + + sleep(1); + + timestamp = HsaRsrcFactory::Instance().TimestampNs(); + status = rocprofiler_get_time(time_id, timestamp, &value_ns, NULL); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + status = rocprofiler_get_time(time_id, timestamp, NULL, &error_ns); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + status = rocprofiler_get_time(time_id, timestamp, NULL, NULL); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + HsaTimer::timestamp_t timestamp2 = timestamp; + HsaTimer::timestamp_t value_ns2 = value_ns; + + printf("time-id(%d) ts_ns(%lu) orig_ns(%lu) time_ns(%lu) err_ns(%lu)\n", (int)time_id, timestamp, tm_val_ns, value_ns, error_ns); + printf("ts-diff(%lu) tm-diff(%lu)\n", timestamp2 - timestamp1, value_ns2 - value_ns1); +} + +// print profiler features void print_features(rocprofiler_feature_t* feature, uint32_t feature_count) { for (rocprofiler_feature_t* p = feature; p < feature + feature_count; ++p) { std::cout << (p - feature) << ": " << p->name; @@ -82,7 +120,7 @@ int main() { rocprofiler_properties_t properties; // Profiling feature 
objects - const unsigned feature_count = 9; + const unsigned feature_count = 6; rocprofiler_feature_t feature[feature_count]; // PMC events memset(feature, 0, sizeof(feature)); @@ -98,12 +136,12 @@ int main() { feature[4].name = "SQ_INSTS_VALU"; feature[5].kind = ROCPROFILER_FEATURE_KIND_METRIC; feature[5].name = "VALUInsts"; - feature[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; - feature[6].name = "TCC_HIT_sum"; - feature[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; - feature[7].name = "TCC_MISS_sum"; - feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; - feature[8].name = "WRITE_SIZE"; +// feature[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[6].name = "TCC_HIT_sum"; +// feature[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[7].name = "TCC_MISS_sum"; +// feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[8].name = "WRITE_SIZE"; // feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; // feature[8].name = "TCC_EA_WRREQ_sum"; // feature[9].kind = ROCPROFILER_FEATURE_KIND_METRIC; @@ -176,5 +214,9 @@ int main() { status = rocprofiler_close(context); TEST_STATUS(status == HSA_STATUS_SUCCESS); + print_sys_time(CLOCK_REALTIME, ROCPROFILER_TIME_ID_CLOCK_REALTIME); + sleep(1); + print_sys_time(CLOCK_MONOTONIC, ROCPROFILER_TIME_ID_CLOCK_MONOTONIC); + return (ret_val) ? 0 : 1; } diff --git a/test/run.sh b/test/run.sh index 4612fa1c..4c985d3e 100755 --- a/test/run.sh +++ b/test/run.sh @@ -32,9 +32,12 @@ fi test_status=0 test_runnum=0 test_number=0 +failed_tests="Failed tests:" + xeval_test() { test_number=$test_number } + eval_test() { label=$1 cmdline=$2 @@ -44,6 +47,7 @@ eval_test() { eval "$cmdline" if [ $? 
!= 0 ] ; then echo "$label: FAILED" + failed_tests="$failed_tests\n $test_number: \"$label\"" test_status=$(($test_status + 1)) else echo "$label: PASSED" @@ -52,18 +56,22 @@ eval_test() { test_number=$((test_number + 1)) } -# enable tools load failure reporting -export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # paths to ROC profiler and oher libraries export LD_LIBRARY_PATH=$PWD + +# enable tools load failure reporting +export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # enable error messages logging to '/tmp/rocprofiler_log.txt' export ROCPROFILER_LOG=1 -# ROC profiler metrics config file +# enable error messages logging to '/tmp/aql_profile_log.txt' +export HSA_VEN_AMD_AQLPROFILE_LOG=1 +# test trace +export ROC_TEST_TRACE=1 + +# Disabple profiler own proxy queue unset ROCP_PROXY_QUEUE # ROC profiler metrics config file export ROCP_METRICS=metrics.xml -# test trace -export ROC_TEST_TRACE=1 ## C test eval_test "C test" ./test/c_test @@ -72,17 +80,26 @@ eval_test "C test" ./test/c_test unset HSA_TOOLS_LIB unset ROCP_TOOL_LIB eval_test "Standalone sampling usage model test" ./test/standalone_test +# Standalone intercepting test +# ROC profiler library loaded by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so.1 +# enable intercepting mode in rocprofiler +export ROCP_HSA_INTERCEPT=2 +# test macro for kernel iterations number +export ROCP_KITER=100 +# test macro for per-kernel dispatching number +export ROCP_DITER=10 +eval_test "Standalone intercepting test" ./test/stand_intercept_test +unset ROCP_HSA_INTERCEPT ## Intercepting usage model test -# ROC profiler library loaded by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so # tool library loaded by ROC profiler export ROCP_TOOL_LIB=./test/libintercept_test.so export ROCP_KITER=50 export ROCP_DITER=50 export ROCP_AGENTS=1 export ROCP_THRS=3 -eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl" +eval_test "Intercepting usage model test" ./test/ctrl ## Libtool test # tool library loaded by ROC profiler @@ 
-111,6 +128,16 @@ export ROCP_THRS=10 export ROCP_INPUT=input1.xml eval_test "'rocprof' libtool test n-threads" ./test/ctrl +## SPM test +# export ROCP_KITER=3 +# export ROCP_DITER=3 +# export ROCP_AGENTS=1 +# export ROCP_THRS=1 +# export ROCP_INPUT=spm_input.xml +# export ROCP_SPM=1 +# eval_test "libtool test, SPM trace test" ./test/ctrl +# unset ROCP_SPM + ## Libtool test, counter sets # Memcopies tracking export ROCP_MCOPY_TRACKING=1 @@ -121,13 +148,30 @@ export ROCP_INPUT=input2.xml eval_test "libtool test, counter sets" ./test/ctrl ## OpenCL test -export ROCP_OBJ_TRACKING=1 +#export ROCP_OBJ_TRACKING=1 +#export ROCP_INPUT=input1.xml +#eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution + +# Memcopies tracking +unset ROCP_MCOPY_TRACKING +# enable HSA intercepting +export ROCP_HSA_INTERC=1 + +export ROCP_KITER=10 +export ROCP_DITER=10 export ROCP_INPUT=input1.xml -eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution +eval_test "libtool test, counter sets" ./test/ctrl + +## OpenCL test +#export ROCP_OBJ_TRACKING=1 +#eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. echo "$test_number tests total / $test_runnum tests run / $test_status tests failed" +if [ $test_status != 0 ] ; then + echo $failed_tests +fi exit $test_status diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 81626a2a..e216b7fd 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -56,6 +56,13 @@ THE SOFTWARE. 
#define DESTRUCTOR_API __attribute__((destructor)) #define KERNEL_NAME_LEN_MAX 128 +#define ONLOAD_TRACE(str) \ + if (getenv("ROCP_ONLOAD_TRACE")) do { \ + std::cout << "PID(" << GetPid() << "): PROF_TOOL_LIB::" << __FUNCTION__ << " " << str << std::endl << std::flush; \ + } while(0); +#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") +#define ONLOAD_TRACE_END() ONLOAD_TRACE("end") + // Disoatch callback data type struct callbacks_data_t { rocprofiler_feature_t* features; @@ -139,8 +146,11 @@ bool is_trace_local = true; // SPM trace enabled bool is_spm_trace = false; +static inline uint32_t GetPid() { return syscall(__NR_getpid); } static inline uint32_t GetTid() { return syscall(__NR_gettid); } +uint32_t my_pid = GetPid(); + // Error handler void fatal(const std::string msg) { fflush(stdout); @@ -475,15 +485,16 @@ bool dump_context_entry(context_entry_t* entry) { const std::string nik_name = (to_truncate_names == 0) ? entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), tid(%lu), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", index, agent_info->dev_index, entry->data.queue_id, entry->data.queue_index, + my_pid, entry->data.thread_id, entry->kernel_properties.grid_size, entry->kernel_properties.workgroup_size, - (entry->kernel_properties.lds_size * (128 * 4)), + (entry->kernel_properties.lds_size + (AgentInfo::lds_block_size - 1)) & ~(AgentInfo::lds_block_size - 1), entry->kernel_properties.scratch_size, (entry->kernel_properties.vgpr_count + 1) * agent_info->vgpr_block_size, (entry->kernel_properties.sgpr_count + 
agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, @@ -659,7 +670,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; if (workgroup_size > UINT32_MAX) abort(); kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; - kernel_properties_ptr->lds_size = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE); // packet->group_segment_size; + kernel_properties_ptr->lds_size = packet->group_segment_size; kernel_properties_ptr->scratch_size = packet->private_segment_size; kernel_properties_ptr->vgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); @@ -826,9 +837,66 @@ static inline void check_env_var(const char* var_name, uint64_t& val) { if (str != NULL ) val = atoll(str); } +// HSA intercepting routines + +// HSA unified callback function +hsa_status_t hsa_unified_callback( + rocprofiler_hsa_cb_id_t id, + const rocprofiler_hsa_callback_data_t* data, + void* arg) +{ + printf("hsa_unified_callback(%d, %p, %p):\n", (int)id, data, arg); + if (data == NULL) abort(); + + switch (id) { + case ROCPROFILER_HSA_CB_ID_ALLOCATE: + printf(" alloc ptr = %p\n", data->allocate.ptr); + printf(" alloc size = %zu\n", data->allocate.size); + printf(" segment type = 0x%x\n", data->allocate.segment); + printf(" global flag = 0x%x\n", data->allocate.global_flag); + printf(" is_code = %x\n", data->allocate.is_code); + break; + case ROCPROFILER_HSA_CB_ID_DEVICE: + printf(" device type = 0x%x\n", data->device.type); + printf(" device id = %u\n", data->device.id); + printf(" device agent = 0x%lx\n", data->device.agent.handle); + printf(" assigned ptr = %p\n", data->device.ptr); + break; + case 
ROCPROFILER_HSA_CB_ID_MEMCOPY: + printf(" memcopy dst = %p\n", data->memcopy.dst); + printf(" memcopy src = %p\n", data->memcopy.src); + printf(" memcopy size = %zu\n", data->memcopy.size); + break; + case ROCPROFILER_HSA_CB_ID_SUBMIT: + printf(" packet %p\n", data->submit.packet); + if (data->submit.kernel_name != NULL) { + printf(" submit kernel \"%s\"\n", data->submit.kernel_name); + printf(" device type = %u\n", data->submit.device_type); + printf(" device id = %u\n", data->submit.device_id); + } + break; + default: + printf("Unknown callback id(%u)\n", id); + abort(); + } + + fflush(stdout); + return HSA_STATUS_SUCCESS; +} + +// HSA callbacks structure +rocprofiler_hsa_callbacks_t hsa_callbacks { + hsa_unified_callback, + hsa_unified_callback, + hsa_unified_callback, + hsa_unified_callback +}; + // Tool constructor extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { + ONLOAD_TRACE_BEG(); + if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); @@ -855,7 +923,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } if (rcfile != NULL) { // Getting defaults - printf("ROCProfiler: rc-file '%s'\n", rcpath.c_str()); + printf("ROCProfiler pid(%u): rc-file '%s'\n", GetPid(), rcpath.c_str()); auto defaults_list = rcfile->GetNodes("top.defaults"); for (auto* entry : defaults_list) { const auto& opts = entry->opts; @@ -908,6 +976,9 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) check_env_var("ROCP_OBJ_TRACKING", settings->code_obj_tracking); // Set memcopies tracking check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking); + // Set HSA intercepting + check_env_var("ROCP_HSA_INTERC", settings->hsa_intercepting); + if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14); is_trace_local = settings->trace_local; @@ -936,7 +1007,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) abort(); } 
std::ostringstream oss; - oss << result_prefix << "/results.txt"; + oss << result_prefix << "/" << GetPid() << "_results.txt"; result_file_handle = fopen(oss.str().c_str(), "w"); if (result_file_handle == NULL) { std::ostringstream errmsg; @@ -1046,10 +1117,14 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) if (err) { errno = err; perror("pthread_attr_init"); abort(); } err = pthread_create(&thread, &attr, monitor_thr_fun, NULL); } + + ONLOAD_TRACE_END(); } // Tool destructor -extern "C" PUBLIC_API void OnUnloadTool() { +void rocprofiler_unload(bool is_destr) { + ONLOAD_TRACE("begin loaded(" << is_loaded << ") destr(" << is_destr << ")"); + if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); @@ -1061,6 +1136,8 @@ extern "C" PUBLIC_API void OnUnloadTool() { abort(); } + if (is_destr) CTX_OUTSTANDING_WAIT = 0; + // Unregister dispatch callback rocprofiler_remove_queue_callbacks(); @@ -1080,6 +1157,7 @@ extern "C" PUBLIC_API void OnUnloadTool() { } fflush(stdout); +#if 0 // Cleanup if (callbacks_data != NULL) { delete[] callbacks_data->features; @@ -1096,8 +1174,19 @@ extern "C" PUBLIC_API void OnUnloadTool() { range_vec = NULL; delete context_array; context_array = NULL; +#endif + + ONLOAD_TRACE_END(); +} + +extern "C" PUBLIC_API void OnUnloadTool() { + ONLOAD_TRACE("begin loaded(" << is_loaded << ")"); + if (is_loaded == true) rocprofiler_unload(false); + ONLOAD_TRACE_END(); } extern "C" DESTRUCTOR_API void destructor() { - if (is_loaded == true) OnUnloadTool(); + ONLOAD_TRACE("begin loaded(" << is_loaded << ")"); + if (is_loaded == true) rocprofiler_unload(true); + ONLOAD_TRACE_END(); } diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index d23a445d..10f9fbc1 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -144,6 +144,12 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize CHECK_STATUS("HSA timer allocation failed", 
(timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + // Time correlation + const uint32_t corr_iters = 1000; + for (unsigned time_id = 0; time_id < HsaTimer::TIME_ID_NUMBER; time_id += 1) { + CorrelateTime((HsaTimer::time_id_t)time_id, corr_iters); + } + // System timeout timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } @@ -512,21 +518,25 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s } // Wait signal -void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { +hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + const hsa_signal_value_t exp_value = signal_value - 1; + hsa_signal_value_t ret_value = signal_value; while (1) { - const hsa_signal_value_t signal_value = - hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); - if (signal_value == 0) { - break; - } else { - CHECK_STATUS("hsa_signal_wait_scacquire()", HSA_STATUS_ERROR); + ret_value = + hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED); + if (ret_value == exp_value) break; + if (ret_value != signal_value) { + std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value + << "), ret_value(" << ret_value << ")" << std::endl << std::flush; + abort(); } } + return ret_value; } // Wait signal with signal value restore void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { - SignalWait(signal); + SignalWait(signal, signal_value); hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } @@ -539,7 +549,7 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src CHECK_STATUS("hsa_signal_create()", status); status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); 
CHECK_STATUS("hsa_amd_memory_async_copy()", status); - SignalWait(s); + SignalWait(s, 1); status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } @@ -683,14 +693,14 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -const char* HsaRsrcFactory::GetKernelName(uint64_t addr) { +const char* HsaRsrcFactory::GetKernelNameRef(uint64_t addr) { std::lock_guard lck(mutex_); const auto it = symbols_map_->find(addr); if (it == symbols_map_->end()) { fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); abort(); } - return strdup(it->second); + return it->second; } void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index 151dab8e..e857813b 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -25,6 +25,8 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef TEST_UTIL_HSA_RSRC_FACTORY_H_ #define TEST_UTIL_HSA_RSRC_FACTORY_H_ +#define AMD_INTERNAL_BUILD + #include #include #include @@ -35,6 +37,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include @@ -161,10 +164,11 @@ struct AgentInfo { // Number of Shader Arrays Per Shader Engines in Gpu uint32_t shader_arrays_per_se; - // SGPR/VGPR block sizes + // SGPR/VGPR/LDS block sizes uint32_t sgpr_block_dflt; uint32_t sgpr_block_size; uint32_t vgpr_block_size; + static const uint32_t lds_block_size = 128 * 4; }; // HSA timer class @@ -175,6 +179,15 @@ class HsaTimer { static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; + enum time_id_t { + TIME_ID_CLOCK_REALTIME = 0, + TIME_ID_CLOCK_REALTIME_COARSE = 1, + TIME_ID_CLOCK_MONOTONIC = 2, + TIME_ID_CLOCK_MONOTONIC_COARSE = 3, + TIME_ID_CLOCK_MONOTONIC_RAW = 4, + TIME_ID_NUMBER + }; + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { timestamp_t sysclock_hz = 0; hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); @@ -190,6 +203,11 @@ class HsaTimer { return timestamp_t((freq_t)time / sysclock_factor_); } + // Method for timespec/ns conversion + static timestamp_t timespec_to_ns(const timespec& time) { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; + } + // Return timestamp in 'ns' timestamp_t timestamp_ns() const { timestamp_t sysclock; @@ -198,6 +216,63 @@ class HsaTimer { return sysclock_to_ns(sysclock); } + // Return time in 'ns' + timestamp_t clocktime_ns(clockid_t clock_id) const { + timespec time; + clock_gettime(clock_id, &time); + return timespec_to_ns(time); + } + + // Return pair of correlated values of profiling timestamp and time with + // correlation error for a given time ID and number of iterations + void correlated_pair_ns(time_id_t time_id, uint32_t iters, + timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) { + clockid_t clock_id = 0; + switch (time_id) { + case TIME_ID_CLOCK_REALTIME: + clock_id = CLOCK_REALTIME; + break; + case TIME_ID_CLOCK_REALTIME_COARSE: + clock_id = CLOCK_REALTIME_COARSE; + break; + case 
TIME_ID_CLOCK_MONOTONIC: + clock_id = CLOCK_MONOTONIC; + break; + case TIME_ID_CLOCK_MONOTONIC_COARSE: + clock_id = CLOCK_MONOTONIC_COARSE; + break; + case TIME_ID_CLOCK_MONOTONIC_RAW: + clock_id = CLOCK_MONOTONIC_RAW; + break; + default: + CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR); + } + + std::vector ts_vec(iters); + std::vector tm_vec(iters); + const uint32_t steps = iters - 1; + + for (uint32_t i = 0; i < iters; ++i) { + hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]); + clock_gettime(clock_id, &tm_vec[i]); + } + + const timestamp_t ts_base = sysclock_to_ns(ts_vec.front()); + const timestamp_t tm_base = timespec_to_ns(tm_vec.front()); + const timestamp_t error = (ts_vec.back() - ts_vec.front()) / (2 * steps); + + timestamp_t ts_accum = 0; + timestamp_t tm_accum = 0; + for (uint32_t i = 0; i < iters; ++i) { + ts_accum += (ts_vec[i] - ts_base); + tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base); + } + + *timestamp_v = (ts_accum / iters) + ts_base + error; + *time_v = (tm_accum / iters) + tm_base; + *error_v = error; + } + private: // Timestamp frequency factor freq_t sysclock_factor_; @@ -299,7 +374,7 @@ class HsaRsrcFactory { uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); // Wait signal - void SignalWait(const hsa_signal_t& signal) const; + hsa_signal_value_t SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; // Wait signal with signal value restore void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; @@ -331,7 +406,7 @@ class HsaRsrcFactory { // Enable executables loading tracking static bool IsExecutableTracking() { return executable_tracking_on_; } static void EnableExecutableTracking(HsaApiTable* table); - static const char* GetKernelName(uint64_t addr); + static const char* GetKernelNameRef(uint64_t addr); // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); @@ -357,6 +432,26 @@ class 
HsaRsrcFactory { if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); } + void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters) { + timestamp_t timestamp_v = 0; + timestamp_t time_v = 0; + timestamp_t error_v = 0; + timer_->correlated_pair_ns(time_id, iters, ×tamp_v, &time_v, &error_v); + time_shift_[time_id] = time_v - timestamp_v; + time_error_[time_id] = error_v; + } + + hsa_status_t GetTimeVal(uint32_t time_id, uint64_t time_stamp, uint64_t* time_value) { + if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; + *time_value = time_stamp + time_shift_[time_id]; + return HSA_STATUS_SUCCESS; + } + + hsa_status_t GetTimeErr(uint32_t time_id, uint64_t* err) { + *err = time_error_[time_id]; + return HSA_STATUS_SUCCESS; + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -421,6 +516,10 @@ class HsaRsrcFactory { // HSA timer HsaTimer* timer_; + // Time shift array to support time conversion + timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER]; + timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER]; + // CPU/kern-arg memory pools hsa_amd_memory_pool_t *cpu_pool_; hsa_amd_memory_pool_t *kern_arg_pool_; From 08d86aaa3a2ac9a4717a050a74d0f49c2cdfc048 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 17 Aug 2020 02:04:08 -0500 Subject: [PATCH 124/153] 3.7 update --- CMakeLists.txt | 1 + bin/build_kernel.sh | 25 +- bin/mem_manager.py | 216 ++++++++++++++++ bin/rpl_run.sh | 2 +- bin/sqlitedb.py | 22 +- bin/tblextr.py | 298 +++++++++++++-------- bin/txt2xml.sh | 9 +- inc/rocprofiler.h | 14 +- src/core/activity.cpp | 99 ------- src/core/context.h | 165 +++++------- src/core/hsa_interceptor.h | 63 ++++- src/core/intercept_queue.cpp | 3 + src/core/intercept_queue.h | 243 ++++++++++++++++-- src/core/profile.h | 101 ++++---- src/core/rocprofiler.cpp | 39 +-- src/core/tracker.h | 43 ++++ src/util/hsa_rsrc_factory.cpp | 5 +- test/app/standalone_test.cpp | 13 - 
test/run.sh | 69 +++-- test/tool/pmc_input.xml | 4 + test/tool/pmc_input1.xml | 14 + test/tool/tool.cpp | 456 ++++++++++++++++++--------------- test/util/hsa_rsrc_factory.cpp | 79 +++--- test/util/hsa_rsrc_factory.h | 94 ++++++- 24 files changed, 1390 insertions(+), 687 deletions(-) create mode 100755 bin/mem_manager.py create mode 100644 test/tool/pmc_input.xml create mode 100644 test/tool/pmc_input1.xml diff --git a/CMakeLists.txt b/CMakeLists.txt index 8aac5175..e6765e47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2params.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/dform.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/mem_manager.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/sqlitedb.py DESTINATION ${DEST_NAME}/bin PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh index 8ed0f168..4b2f87dd 100755 --- a/bin/build_kernel.sh +++ b/bin/build_kernel.sh @@ -40,8 +40,17 @@ else LIB_DIR=$LLVM_DIR/lib fi -BC_DIR=$LIB_DIR/bitcode -if [ ! 
-d "$BC_DIR" ] ; then BC_DIR=$LIB_DIR; fi +# Determine whether using new or old device-libs layout +if [ -e $LIB_DIR/bitcode/opencl.amdgcn.bc ]; then + BC_DIR=$LIB_DIR/bitcode +elif [ -e $LIB_DIR/opencl.amdgcn.bc ]; then + BC_DIR=$LIB_DIR +elif [ -e $ROCM_DIR/amdgcn/bitcode/opencl.bc ]; then + BC_DIR=$ROCM_DIR/amdgcn/bitcode +else + echo "Error: Cannot find amdgcn bitcode directory" + exit 1 +fi CLANG_ROOT=$LLVM_DIR/lib/clang CLANG_DIR=`ls -d $CLANG_ROOT/* | head -n 1` @@ -52,10 +61,14 @@ fi BIN_DIR=$LLVM_DIR/bin INC_DIR=$CLANG_DIR/include -BITCODE_OPTS="\ - -Xclang -mlink-bitcode-file -Xclang $BC_DIR/opencl.amdgcn.bc \ - -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ockl.amdgcn.bc \ - -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ocml.amdgcn.bc" +if [ -e $BC_DIR/opencl.amdgcn.bc ]; then + BITCODE_OPTS="-nogpulib \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/opencl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ockl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ocml.amdgcn.bc" +else + BITCODE_OPTS="--hip-device-lib-path=$BC_DIR" +fi for GFXIP in $TGT_LIST ; do OBJ_PREF=$GFXIP diff --git a/bin/mem_manager.py b/bin/mem_manager.py new file mode 100755 index 00000000..8b616cc6 --- /dev/null +++ b/bin/mem_manager.py @@ -0,0 +1,216 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + +import sys, os, re +from sqlitedb import SQLiteDB + +pinned = ['hipMallocHost', 'hipHostMalloc', 'hipHostAlloc'] +ondevice = ['hipMalloc', 'hipMallocPitch', 'hipMallocArray', 'hipMalloc3DArray'] + +mm_table_descr = [ + ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'Direction', 'SrcType', 'DstType', 'Size', 'BW', 'Async'], + {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'Direction':'TEXT', 'SrcType':'TEXT', 'DstType':'TEXT', 'Size':'INTEGER', 'BW':'TEXT', 'Async':'TEXT'} +] + +def fatal(msg): + sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); + sys.exit(1) + +DELIM = ',' + +# Mem copy manager class +class MemManager: + + def __init__(self, db): + self.db = db + self.allocations = {} + self.memcopies = {} + self.filename = '' + self.fd = 0 + + def __del__(self): + if self.fd != 0: self.fd.close() + + # register allo and memcpy API calls + def register_api(self, rec_vals): + res = '' + malloc_ptrn = re.compile(r'hip.*Malloc') + mcopy_ptrn = re.compile(r'hipMemcpy') + record_name = rec_vals[4] + record_args = rec_vals[5] + if malloc_ptrn.match(record_name): + self.add_allocation(record_name, record_args) + elif mcopy_ptrn.match(record_name): + res = self.add_memcpy(rec_vals) + + return res + + # add allocation to map + def add_allocation(self, event, args): + choice = 0 + if event == "hipMallocPitch": + malloc_args_ptrn = re.compile(r'\(ptr\((.*)\) width\((.*)\) height\((.*)\)\)') + choice = 1 + elif event == "hipMallocArray": + malloc_args_ptrn = re.compile(r'\(array\((.*)\) width\((.*)\) height\((.*)\)\)') + choice = 1 + elif event == "hipMalloc3DArray": + malloc_args_ptrn = re.compile(r'\(array\((.*)\) width\((.*)\) height\((.*)\) depth\((.*)\)\)') + choice = 2 + else: + #(ptr(0x7f3407000000) size(800000000) flags(0)) + malloc_args_ptrn = re.compile(r'\(ptr\((.*)\) size\((.*)\) .*\)') + choice = 3 + m = malloc_args_ptrn.match(args) + if m: 
# get type of ptr
def get_ptr_type(self, ptr):
    """Classify the memory a pointer refers to.

    ptr is a hexadecimal address string. The address is looked up in
    self.allocations, which maps a base address to a (size, event) pair.

    Returns 'pageable' when no registered allocation contains the address,
    'pinned' or 'device' when the allocating event is listed in the
    module-level `pinned` / `ondevice` collections. Any other event is
    reported through the module-level fatal() helper.
    """
    addr = int(ptr, 16)
    for base, (size, event) in self.allocations.items():
        if base <= addr < base + size:
            break
    else:
        # No registered allocation covers the address.
        return 'pageable'

    if event in pinned:
        return 'pinned'
    if event in ondevice:
        return 'device'
    fatal('internal error: ptr(' + ptr + ') cannot be identified')
    return 'unknown'


# add memcpy to map
def add_memcpy(self, recvals):
    """Build the memcopy summary line for one API record and remember it.

    recvals is the list of record fields:
    [0] begin timestamp, [1] end timestamp, [2] process id, [3] thread id,
    [4] event (API) name, [5] argument string, [6] record/correlation id.

    For events whose name contains 'Async' the matching 'COPY' table record
    is fetched to obtain the copy timestamps and the thread id.

    The line is stored in self.memcopies under the record id and returned.
    Fields are joined with the module-level DELIM separator.
    """
    recordid = recvals[6]  # same as correlation id
    event = recvals[4]
    start_time = recvals[0]  # sync time stamp
    end_time = recvals[1]  # sync time stamp
    args = recvals[5]
    procid = recvals[2]  # used to query async entries
    pid = recvals[2]
    tid = recvals[3]

    # hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind)
    hipMemcpy_ptrn = re.compile(r'\(dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)')
    # hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width,
    #             size_t height, hipMemcpyKind kind);
    hipMemcpy_ptrn2 = re.compile(r'\(dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)')
    # hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src,
    #                  size_t count, hipMemcpyKind kind);
    hipMemcpy_ptrn3 = re.compile(r'\(dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)')
    # memcopy with kind argument
    hipMemcpy_ptrn_kind = re.compile(r'.* kind\((\d+)\)\s*.*')

    # async memcopy: timing and thread id come from the COPY table record
    is_async = 1 if 'Async' in event else 0
    if is_async:
        select_expr = '"Index" = ' + str(recordid) + ' AND "proc-id" = ' + str(procid)
        async_copy_recvals = self.db.table_get_record('COPY', select_expr)
        async_copy_start_time = async_copy_recvals[0]
        async_copy_end_time = async_copy_recvals[1]
        tid = async_copy_recvals[4]

    size = 0
    dstptr_type = 'unknown'
    srcptr_type = 'unknown'
    direction = 'unknown'
    bandwidth = 0

    switcher = {
        '0': "HtoH",
        '1': "HtoD",
        '2': "DtoH",
        '3': "DtoD",
        '4': "auto",
    }

    # Each entry pairs a match with the way its byte count is computed.
    # The order is kept so that a later match overrides an earlier one.
    candidates = (
        (hipMemcpy_ptrn.match(args), lambda m: int(m.group(3))),
        (hipMemcpy_ptrn3.match(args), lambda m: int(m.group(3))),
        # width * height: convert first, the groups are strings
        (hipMemcpy_ptrn2.match(args), lambda m: int(m.group(3)) * int(m.group(4))),
    )
    matched = False
    for m, size_of in candidates:
        if not m:
            continue
        matched = True
        dstptr_type = self.get_ptr_type(m.group(1))
        srcptr_type = self.get_ptr_type(m.group(2))
        size = size_of(m)

    if matched:
        if is_async:
            duration = int(async_copy_end_time) - int(async_copy_start_time)
        else:
            duration = int(end_time) - int(start_time)
        # Equal (or inverted) timestamps would otherwise divide by zero or
        # yield a negative rate; report 0 in that case.
        if duration > 0:
            bandwidth = float(size) * 1000 / duration

    m = hipMemcpy_ptrn_kind.match(args)
    if m:
        direction = switcher.get(m.group(1), "unknown")

    # pid/tid may be integers (tid is read from the DB on the async path),
    # so every field is converted explicitly before joining.
    copy_line = DELIM.join([
        str(start_time),
        str(end_time),
        str(pid),
        str(tid),
        event,
        'Direction=' + direction,
        'SrcType=' + srcptr_type,
        'DstType=' + dstptr_type,
        "Size=" + str(size),
        "BW=" + str(round(bandwidth, 2)),
        'Async=' + str(is_async),
    ])

    self.memcopies[recordid] = copy_line
    return copy_line


def dump_data(self):
    """Store the collected memcopy lines in the "MM" DB table and dump it as CSV.

    The CSV is written to $PWD/results.memcopy_info.csv. For fields of the
    form 'key=value' only the value part is stored.
    """
    # To create "MM" table in DB on the finish
    table_name = "MM"
    file_name = os.environ['PWD'] + '/results.memcopy_info.csv'
    print("File '" + file_name + "' is generating")
    table_handle = self.db.add_table(table_name, mm_table_descr)

    fld_ptrn = re.compile(r'(.*)=(.*)')
    for record in self.memcopies.values():
        rec_vals_array = []
        for rec in record.split(DELIM):
            fld_match = fld_ptrn.match(rec)
            rec_vals_array.append(fld_match.group(2) if fld_match else rec)
        self.db.insert_entry(table_handle, rec_vals_array)
    # To dump the MM table as CSV
    self.db.dump_csv(table_name, file_name)
def table_get_record(self, table_name, rec_pat):
    """Return the single record of table_name selected by rec_pat as a list.

    rec_pat is an SQL expression placed after WHERE as-is. An Exception is
    raised unless the query yields exactly one row.
    """
    query = 'SELECT * FROM ' + table_name + ' WHERE ' + rec_pat
    rows = self.connection.execute(query).fetchall()
    if len(rows) == 1:
        return list(rows[0])
    raise Exception('Record (' + rec_pat + ') is not unique, table "' + table_name + '"')
# stream ID map: stream id string -> small sequential index (starting at 1)
stream_counter = 0
stream_id_map = {}


def get_stream_index(stream_id):
    """Return the sequential index assigned to stream_id.

    'nil' (in any letter case) always maps to 0 and is never registered.
    Any other id receives the next counter value on first use and the same
    value on every later call.
    """
    global stream_counter
    if stream_id.lower() == 'nil':
        return 0
    if stream_id in stream_id_map:
        return stream_id_map[stream_id]
    stream_counter += 1
    stream_id_map[stream_id] = stream_counter
    return stream_counter


# patching activity records
def activity_record_patching(db, ops_table_name, kernel_found, kernel_name, stream_found, stream_ind, select_expr):
    """Update the records of ops_table_name selected by select_expr.

    The 'Name' field is set to kernel_name when kernel_found is non-zero,
    and the 'tid' field is set to stream_ind when stream_found is non-zero.
    """
    updates = []
    if kernel_found != 0:
        updates.append('Name = "' + kernel_name + '"')
    if stream_found != 0:
        updates.append('tid = ' + str(stream_ind))
    for fld_expr in updates:
        db.change_rec_fld(ops_table_name, fld_expr, select_expr)
# arguments manipulation routines
def get_field(args, field):
    """Extract the value of 'field(value)' from an API arguments string.

    The field name must be preceded by a space. The value ends at the first
    ') ' that follows it or, when there is none, at a trailing '))'.

    Returns (value, n): n is non-zero when the value was extracted. When the
    field is absent, args is returned unchanged together with 0.
    """
    # Drop everything up to and including the last ' field('.
    (value, n) = re.subn(r'^.* ' + field + r'\(', '', args, count=1)
    if n != 0:
        # Cut the remaining arguments that follow the value ...
        (value, n) = re.subn(r'\) .*$', '', value, count=1)
        if n == 0:
            # ... or, for the last argument, the closing parentheses.
            (value, n) = re.subn(r'\)\)$', '', value, count=1)
    return (value, n)


def set_field(args, field, val):
    """Replace the value of the first 'field(word)' in args with val.

    Only values made of word characters and followed by a space or ')' are
    replaced. Returns (new_args, n) where n is the number of replacements.
    """
    return re.subn(field + r'\(\w+\)([ \)])', field + '(' + str(val) + r')\1', args, count=1)
table - record_id = 0 + record_id_dict = {} table_handle = db.add_table(table_name, api_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): @@ -319,27 +358,49 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep mfixformat = ptrn_fixformat.match(record) if mfixformat: #replace '=' in args with parentheses reformated_args = kernel_arg + mfixformat.group(2).replace('=','(').replace(',',')')+')' - record = mfixformat.group(1) + '(' + reformated_args + ')' + record = mfixformat.group(1) + '( ' + reformated_args + ')' m = ptrn_val.match(record) if m: rec_vals = [] - rec_len = len(api_table_descr[0]) - for ind in range(1,rec_len): + rec_len = len(api_table_descr[0]) - 1 + for ind in range(1, rec_len): rec_vals.append(m.group(ind)) - proc_id = rec_vals[2] - rec_vals[2] = api_pid - rec_vals.append(record_id) - db.insert_entry(table_handle, rec_vals) + proc_id = int(rec_vals[2]) + thrd_id = int(rec_vals[3]) + record_name = rec_vals[4] + record_args = rec_vals[5] + + # incrementing per-process record id/correlation id + if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 + corr_id = record_id_dict[proc_id] + record_id_dict[proc_id] += 1 + rec_vals.append(corr_id) + + # extracting/converting stream id + (stream_id, stream_found) = get_field(record_args, 'stream') + if stream_found != 0: + stream_id = get_stream_index(stream_id) + (rec_vals[5], found) = set_field(record_args, 'stream', stream_id) + if found == 0: fatal('set_field() failed for "stream", args: "' + record_args + '"') + else: stream_id = 0 # dependencies filling - if ptrn_ac.search(rec_vals[4]) or record_id in dep_filtr: + if ptrn_ac.search(record_name) or (corr_id, proc_id) in dep_filtr: beg_ns = int(rec_vals[0]) end_ns = int(rec_vals[1]) from_us = (beg_ns / 1000) + ((end_ns - beg_ns) / 1000) - dep_from_us_list.append(from_us) - dep_tid_list.append(int(rec_vals[3])) - dep_id_list.append(record_id) + + if not proc_id in dep_dict: 
dep_dict[proc_id] = {} + dep_proc = dep_dict[proc_id] + found = 1 if dep_pid in dep_proc else 0 + if found == 0 and dep_pid == OPS_PID: + dep_proc[dep_pid] = { 'pid': api_pid, 'from': [], 'id': [] } + found = 1 + if found == 1: + dep_str = dep_proc[dep_pid] + dep_str['from'].append((from_us, thrd_id, stream_id)) + if expl_id: dep_str['id'].append(corr_id) # memcopy data if len(copy_raws) != 0: @@ -347,40 +408,50 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep args_str = rec_vals[5] args_str = re.sub(r'\(', r'', args_str) args_str = re.sub(r'\).*$', r'', args_str) - copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + rec_vals[4] + ', ' + args_str + copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + record_name + ', ' + args_str copy_csv += str(copy_index) + ', ' + copy_line + '\n' copy_index += 1 # patching activity properties: kernel name, stream-id - corr_id = record_id if (corr_id, proc_id) in dep_filtr: + ops_table_name = dep_filtr[(corr_id, proc_id)] + + select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + str(proc_id) record_args = rec_vals[rec_len - 2] - select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + proc_id - # extract kernel name - (kernel_name, n_subs) = extract_field(record_args, 'kernel') - if n_subs != 0: - db.change_rec_fld('OPS', 'Name = "' + kernel_name + '"', select_expr) - # extract stream-id - (stream_id, n_subs) = extract_field(record_args, 'stream') - if n_subs != 0: - if stream_id == 'nil' or stream_id == 'NIL': stream_id = 0 - db.change_rec_fld('OPS', 'tid = ' + stream_id, select_expr) - record_id += 1 + # extract kernel name string + (kernel_str, kernel_found) = get_field(record_args, 'kernel') + is_kernel_list = 1 if kernel_found != 0 and kernel_str[-1] == ';' else 0 + + if is_kernel_list != 0: + for kernel_item in kernel_str[:-1].split(';'): + m = ptrn_multi_kernel.match(kernel_item) + if m: + kernel_name = m.group(1) + dev_id = m.group(2) + 
select_expr += ' AND "dev-id" = ' + dev_id + activity_record_patching(db, ops_table_name, 1, kernel_name, stream_found, stream_id, select_expr) + else: + fatal('Bad multi-kernel format: "' + kernel_item + '" in "' + kernel_str + '"') + else: + activity_record_patching(db, ops_table_name, kernel_found, kernel_str, stream_found, stream_id, select_expr) + + api_data = '' + if mcopy_data_enabled: + api_data = memory_manager.register_api(rec_vals) if len(dep_filtr) else '' + rec_vals.append(api_data) + + rec_vals[2] = api_pid + + db.insert_entry(table_handle, rec_vals) else: fatal(api_name + " bad record: '" + record + "'") # inserting of dispatch events correlated to the dependent dispatches - for (tid, from_ns) in dep_list: - db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) - record_id += 1 - - # registering dependencies informatino - if dep_pid != NONE_PID: - if not dep_pid in dep_dict: dep_dict[dep_pid] = {} - dep_dict[dep_pid]['pid'] = api_pid - dep_dict[dep_pid]['tid'] = dep_tid_list - dep_dict[dep_pid]['from'] = dep_from_us_list - if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + for (from_ns, proc_id, thrd_id) in dep_list: + if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 + corr_id = record_id_dict[proc_id] + record_id_dict[proc_id] += 1 + db.insert_entry(table_handle, [from_ns, from_ns, api_pid, thrd_id, 'hsa_dispatch', '', corr_id, '']) # generating memcopy CSV if copy_csv != '': @@ -394,19 +465,17 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep # fill COPY DB copy_table_descr = [ - ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index'], - {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} + ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index', 'proc-id'], + {'Index':'INTEGER', 'proc-id':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 
'tid':'INTEGER'} ] def fill_copy_db(table_name, db, indir): + pid = COPY_PID file_name = indir + '/' + 'async_copy_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (.*)$') - ptrn_id = re.compile(r'^async-copy(\d+)$') + ptrn_id = re.compile(r'^async-copy:(\d+):(\d+)$') if not os.path.isfile(file_name): return 0 - if not COPY_PID in dep_dict: dep_dict[COPY_PID] = {} - dep_to_us_dict = {} - table_handle = db.add_table(table_name, copy_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): @@ -417,22 +486,32 @@ def fill_copy_db(table_name, db, indir): for ind in range(1,4): rec_vals.append(m.group(ind)) rec_vals.append(COPY_PID) rec_vals.append(0) + m = ptrn_id.match(rec_vals[2]) - if m: dep_to_us_dict[int(m.group(1))] = int(rec_vals[0]) / 1000 - else: fatal("bad async-copy entry") - rec_vals.append(m.group(1)) + if not m: fatal("bad async-copy entry '" + record + "'") + corr_id = int(m.group(1)) + proc_id = int(m.group(2)) + rec_vals.append(corr_id) + rec_vals.append(proc_id) + db.insert_entry(table_handle, rec_vals) - else: fatal("async-copy bad record: '" + record + "'") - dep_dict[COPY_PID]['to'] = dep_to_us_dict + # filling dependencies + if not proc_id in dep_dict: dep_dict[proc_id] = {} + dep_proc = dep_dict[proc_id] + if not pid in dep_proc: dep_proc[pid] = { 'pid': HSA_PID, 'from': [], 'to': {}, 'id': [] } + dep_str = dep_proc[pid] + dep_str['to'][corr_id] = int(rec_vals[0]) / 1000 + + else: fatal("async-copy bad record: '" + record + "'") return 1 ############################################################# # fill HCC ops DB ops_table_descr = [ - ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'proc-id'], - {'Index':'INTEGER', 'proc-id':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} + ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'proc-id', 'Data'], + {'Index':'INTEGER', 
'proc-id':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Data':'TEXT'} ] def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): global max_gpu_id @@ -446,7 +525,6 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): filtr = {} - record_id = 0 kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr) mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr) with open(file_name, mode='r') as fd: @@ -462,13 +540,15 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): if not m: fatal("bad hcc ops entry '" + record + "'") name = m.group(1) corr_id = int(m.group(2)) - 1 - proc_id = m.group(3) + proc_id = int(m.group(3)) # checking name for memcopy pattern if ptrn_mcopy.search(name): + rec_table_name = mcopy_table_name table_handle = mcopy_table_handle pid = COPY_PID; else: + rec_table_name = kernel_table_name table_handle = kernel_table_handle gpu_id = int(rec_vals[2]); @@ -484,16 +564,18 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): rec_vals.append(0) # tid rec_vals.append(corr_id) # Index rec_vals.append(proc_id) # proc-id + rec_vals.append('') # Data db.insert_entry(table_handle, rec_vals) # registering a dependency filtr - filtr[(corr_id, proc_id)] = 1 + filtr[(corr_id, proc_id)] = rec_table_name - # filling a dependency - if not pid in dep_dict: dep_dict[pid] = {} - if not 'to' in dep_dict[pid]: dep_dict[pid]['to'] = {} - dep_dict[pid]['to'][corr_id] = int(rec_vals[0]) / 1000 - dep_dict[pid]['bsp'] = OPS_PID + # filling a dependencies + if not proc_id in dep_dict: dep_dict[proc_id] = {} + dep_proc = dep_dict[proc_id] + if not pid in dep_proc: dep_proc[pid] = { 'bsp': OPS_PID, 'to': {} } + dep_str = dep_proc[pid] + dep_str['to'][corr_id] = int(rec_vals[0]) / 1000 else: fatal("hcc ops bad record: '" + record + "'") @@ -512,6 +594,16 @@ def fill_ops_db(kernel_table_name, 
mcopy_table_name, db, indir): dbfile = '' csvfile = '' +begin_ts_file = indir + '/begin_ts_file.txt' +if os.path.isfile(begin_ts_file): + with open(begin_ts_file, mode='r') as fd: + ind = 0 + for line in fd.readlines(): + val = int(line) / 1000 + if ind == 0 or val < START_US: START_US = val + ind += 1 + print('START timestamp found (' + str(START_US) + 'us)') + if re.search(r'\.csv$', outfile): csvfile = outfile elif re.search(r'\.db$', outfile): @@ -540,6 +632,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) + memory_manager = MemManager(db) ext_trace_found = fill_ext_db('rocTX', db, indir, 'roctx', EXT_PID) @@ -613,32 +706,35 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): dform.gen_api_json_trace(db, 'KFD', START_US, jsonfile) if any_trace_found: - for (to_pid, dep_str) in dep_dict.items(): - if 'bsp' in dep_str: - bspid = dep_str['bsp'] - base_str = dep_dict[bspid] - for v in ('pid', 'tid', 'from', 'id'): - dep_str[v] = base_str[v] - base_str['inv'] = 1 - dep_id = 0 - for (to_pid, dep_str) in dep_dict.items(): - if 'inv' in dep_str: continue - if not 'to' in dep_str: continue - - to_us_dict = dep_str['to'] - from_us_list = dep_str['from'] - from_pid = dep_str['pid'] - tid_list = dep_str['tid'] - corr_id_list = [] - if 'id' in dep_str: corr_id_list = dep_str['id'] - - db.flow_json(dep_id, from_pid, tid_list, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) - dep_id += len(tid_list) + for (proc_id, dep_proc) in dep_dict.items(): + for (to_pid, dep_str) in dep_proc.items(): + if 'bsp' in dep_str: + bspid = dep_str['bsp'] + base_str = dep_proc[bspid] + for v in ('pid', 'from', 'id'): + dep_str[v] = base_str[v] + base_str['inv'] = 1 + + for (to_pid, dep_str) in dep_proc.items(): + if 'inv' in dep_str: continue + if not 'to' in dep_str: continue + + from_pid = dep_str['pid'] + from_us_list = dep_str['from'] + to_us_dict = dep_str['to'] + 
corr_id_list = dep_str['id'] + + db.flow_json(dep_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) + dep_id += len(from_us_list) if any_trace_found: db.metadata_json(jsonfile, sysinfo_file) db.close_json(jsonfile); + + if mcopy_data_enabled: + memory_manager.dump_data() + db.close() sys.exit(0) diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh index 126337ed..e5bc3e3d 100755 --- a/bin/txt2xml.sh +++ b/bin/txt2xml.sh @@ -64,7 +64,7 @@ parse() { gpu_index=$line fi else - found=$(echo $feature | sed -n "/^\(pmc\|sqtt\|hsa\)$/ p") + found=$(echo $feature | sed -n "/^\(pmc\|hsa\)$/ p") if [ -n "$found" ] ; then output=$outdir/input${index}.xml header="# $timestamp '$output' generated with '$0 $*'" @@ -78,13 +78,6 @@ parse() { EOF fi - if [ "$feature" == "sqtt" ] ; then - cat >> $output < - -EOF - fi - if [ "$feature" == "hsa" ] ; then cat >> $output < diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 24925cae..3f295a15 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -72,6 +72,8 @@ typedef struct { uint64_t timeout; uint32_t timestamp_on; uint32_t hsa_intercepting; + uint32_t k_concurrent; + uint32_t opt_mode; } rocprofiler_settings_t; //////////////////////////////////////////////////////////////////////////////// @@ -90,8 +92,6 @@ hsa_status_t rocprofiler_error_string( typedef enum { ROCPROFILER_FEATURE_KIND_METRIC = 0, ROCPROFILER_FEATURE_KIND_TRACE = 1, - ROCPROFILER_FEATURE_KIND_SPM_MOD = 2, - ROCPROFILER_FEATURE_KIND_PCSMP_MOD = 4 } rocprofiler_feature_kind_t; // Profiling feture parameter @@ -478,7 +478,8 @@ typedef enum { ROCPROFILER_HSA_CB_ID_ALLOCATE = 0, // Memory allocate callback ROCPROFILER_HSA_CB_ID_DEVICE = 1, // Device assign callback ROCPROFILER_HSA_CB_ID_MEMCOPY = 2, // Memcopy callback - ROCPROFILER_HSA_CB_ID_SUBMIT = 3 // Packet submit callback + ROCPROFILER_HSA_CB_ID_SUBMIT = 3, // Packet submit callback + ROCPROFILER_HSA_CB_ID_KSYMBOL = 4 // Loading/unloading of kernel symbol } rocprofiler_hsa_cb_id_t; 
// HSA callback data type @@ -509,6 +510,12 @@ typedef struct { uint32_t device_type; // type of device the packed is submitted to uint32_t device_id; // id of device the packed is submitted to } submit; + struct { + uint64_t object; // kernel symbol object + const char* name; // kernel symbol name + uint32_t name_length; // kernel symbol name length + int destroy; // symbol executable destroy + } ksymbol; }; } rocprofiler_hsa_callback_data_t; @@ -524,6 +531,7 @@ typedef struct { rocprofiler_hsa_callback_fun_t device; // agent assign callback rocprofiler_hsa_callback_fun_t memcopy; // memory copy callback rocprofiler_hsa_callback_fun_t submit; // packet submit callback + rocprofiler_hsa_callback_fun_t ksymbol; // kernel symbol callback } rocprofiler_hsa_callbacks_t; // Set callbacks. If the callback is NULL then it is disabled. diff --git a/src/core/activity.cpp b/src/core/activity.cpp index c72977e1..19f6bea3 100644 --- a/src/core/activity.cpp +++ b/src/core/activity.cpp @@ -55,92 +55,6 @@ void check_status(hsa_status_t status) { } } -// Activity primitives -namespace activity_prim { -// PC sampling callback data -struct pcsmp_callback_data_t { - const char* kernel_name; // sampled kernel name - void* data_buffer; // host buffer for tracing data - uint64_t id; // sample id - uint64_t cycle; // sample cycle - uint64_t pc; // sample PC -}; - -uint32_t activity_op = UINT32_MAX; -void* activity_arg = NULL; -std::atomic activity_callback{NULL}; -rocprofiler_t* context = NULL; - -hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, - hsa_ven_amd_aqlprofile_info_data_t* info_data, - void* data) { - const pcsmp_callback_data_t* pcsmp_data = (pcsmp_callback_data_t*) data; - - activity_record_t record{}; - record.op = activity_op; - record.pc_sample.se = pcsmp_data->id; - record.pc_sample.cycle = pcsmp_data->cycle; - record.pc_sample.pc = pcsmp_data->pc; - activity_async_callback_t fun = activity_callback.load(std::memory_order_acquire); - if (fun) { - 
(fun)(activity_op, &record, activity_arg); - } else { - free((void*)(pcsmp_data->kernel_name)); - } - return HSA_STATUS_SUCCESS; -} - -bool context_handler(rocprofiler_group_t group, void* arg) { - hsa_agent_t agent{}; - hsa_status_t status = rocprofiler_get_agent(group.context, &agent); - check_status(status); - const rocprofiler::util::AgentInfo* agent_info = rocprofiler::util::HsaRsrcFactory::Instance().GetAgentInfo(agent); - - pcsmp_callback_data_t pcsmp_data{}; - pcsmp_data.kernel_name = (const char*)arg; - pcsmp_data.data_buffer = rocprofiler::util::HsaRsrcFactory::Instance().AllocateSysMemory(agent_info, rocprofiler::TraceProfile::GetSize()); - status = rocprofiler_iterate_trace_data(group.context, trace_data_cb, &pcsmp_data); - check_status(status); - return false; -} - -// Kernel disoatch callback -hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, - rocprofiler_group_t* group) { - // context features - const rocprofiler_feature_kind_t trace_kind = - (rocprofiler_feature_kind_t)(ROCPROFILER_FEATURE_KIND_TRACE | ROCPROFILER_FEATURE_KIND_PCSMP_MOD); - const uint32_t feature_count = 1; - const uint32_t parameter_count = 1; - rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; - memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); - rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; - memset(features, 0, parameter_count * sizeof(rocprofiler_parameter_t)); - parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; - parameters[0].value = 0; - - features[0].kind = trace_kind; - features[0].parameters = parameters; - features[0].parameter_count = parameter_count; - - // context properties - rocprofiler_properties_t properties{}; - properties.handler = context_handler; - properties.handler_arg = (void*)strdup(callback_data->kernel_name); - - // Open profiling context - hsa_status_t status = 
rocprofiler_open(callback_data->agent, features, feature_count, - &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); - check_status(status); - - // Get group[0] - status = rocprofiler_get_group(context, 0, group); - check_status(status); - - return status; -} -} // namespace activity_prim - extern "C" { PUBLIC_API const char* GetOpName(uint32_t op) { return strdup("PCSAMPLE"); } @@ -149,23 +63,10 @@ PUBLIC_API bool RegisterApiCallback(uint32_t op, void* callback, void* arg) { re PUBLIC_API bool RemoveApiCallback(uint32_t op) { return true; } PUBLIC_API bool InitActivityCallback(void* callback, void* arg) { - activity_prim::activity_arg = arg; - activity_prim::activity_callback.store((activity_async_callback_t)callback, std::memory_order_release); - - rocprofiler_queue_callbacks_t queue_callbacks{}; - queue_callbacks.dispatch = activity_prim::dispatch_callback; - rocprofiler_set_queue_callbacks(queue_callbacks, NULL); - return true; } PUBLIC_API bool EnableActivityCallback(uint32_t op, bool enable) { - if (enable) { - activity_prim::activity_op = op; - rocprofiler_start_queue_callbacks(); - } else { - rocprofiler_stop_queue_callbacks(); - } return true; } } // extern "C" diff --git a/src/core/context.h b/src/core/context.h index 7131d338..8be3a9e8 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -83,7 +83,6 @@ class Group { Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) : pmc_profile_(agent_info), - trace_profile_(agent_info), n_profiles_(0), refs_(1), context_(context), @@ -96,33 +95,24 @@ class Group { case ROCPROFILER_FEATURE_KIND_METRIC: pmc_profile_.Insert(info); break; - case ROCPROFILER_FEATURE_KIND_TRACE: - trace_profile_.Insert(info); - break; default: EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } } - hsa_status_t Finalize() { - hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, read_vector_); - if (status == HSA_STATUS_SUCCESS) { - status = 
trace_profile_.Finalize(start_vector_, stop_vector_, read_vector_); - } + hsa_status_t Finalize(const bool is_concurrent = false) { + hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, + read_vector_, is_concurrent); if (status == HSA_STATUS_SUCCESS) { if (!pmc_profile_.Empty()) ++n_profiles_; - if (!trace_profile_.Empty()) ++n_profiles_; } return status; } void GetProfiles(profile_vector_t& vec) { pmc_profile_.GetProfiles(vec); - trace_profile_.GetProfiles(vec); } - void GetTraceProfiles(profile_vector_t& vec) { trace_profile_.GetProfiles(vec); } - info_vector_t& GetInfoVector() { return info_vector_; } const pkt_vector_t& GetStartVector() const { return start_vector_; } const pkt_vector_t& GetStopVector() const { return stop_vector_; } @@ -137,7 +127,6 @@ class Group { private: PmcProfile pmc_profile_; - TraceProfile trace_profile_; info_vector_t info_vector_; pkt_vector_t start_vector_; pkt_vector_t stop_vector_; @@ -183,7 +172,7 @@ class Context { uint32_t GetGroupCount() const { return set_.size(); } - inline rocprofiler_group_t GetGroupInfo(Group* g) { + inline rocprofiler_group_t GetGroupDescr(Group* g) { rocprofiler::info_vector_t& info_vector = g->GetInfoVector(); rocprofiler_group_t group = {}; group.index = g->GetIndex(); @@ -192,12 +181,12 @@ class Context { group.feature_count = info_vector.size(); return group; } - inline rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + inline rocprofiler_group_t GetGroupDescr(const uint32_t& index) { rocprofiler_group_t group = {}; if (set_.empty()) { group.context = reinterpret_cast(this); } else { - group = GetGroupInfo(&set_[index]); + group = GetGroupDescr(&set_[index]); } return group; } @@ -272,15 +261,28 @@ class Context { } } - void IterateTraceData(rocprofiler_trace_data_callback_t callback, void* data) { - profile_vector_t profile_vector; - set_[0].GetTraceProfiles(profile_vector); + /* Handle the completion of kernel-begin 'read' packet */ + static bool 
HandlerRead(hsa_signal_value_t value, void* arg) { + Group* group = reinterpret_cast(arg); + Context* context = group->GetContext(); + + // Handle the completion signal of read packet at kernel begin + const profile_vector_t profile_vector = context->GetProfiles(group->GetIndex()); for (auto& tuple : profile_vector) { - if (pcsmp_mode_) const_cast(tuple.profile)->event_count = UINT32_MAX; - const hsa_status_t status = - api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, callback, data); - if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); + // Wait for read packet to complete + util::HsaRsrcFactory::Instance().SignalWaitRestore(tuple.completion_signal, 1); + const profile_t* profile = tuple.profile; + // Copy the counter values, read at kernel begin, to the right half of + // the buffer, so that the next kernel-end read can reuse the left half + char* data = reinterpret_cast(profile->output_buffer.ptr); + const uint32_t num = profile->output_buffer.size / 2; + for(uint32_t i = 0; i < num; ++i) { + data[i+num] = data[i]; // left --> right + data[i] = 0; // reset left + } } + + return false; } static bool Handler(hsa_signal_value_t value, void* arg) { @@ -288,8 +290,8 @@ class Context { Context* context = group->GetContext(); auto r = group->FetchDecrRefsCount(); if (r == 1) { - const rocprofiler_group_t group_info = context->GetGroupInfo(group); - context->handler_(group_info, context->handler_arg_); + const rocprofiler_group_t group_descr = context->GetGroupDescr(group); + context->handler_(group_descr, context->handler_arg_); } return false; } @@ -298,6 +300,25 @@ class Context { Group* GetGroup(const uint32_t& index) { return &set_[index]; } rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } + void SetDispatchSignal(const hsa_signal_t &signal) { + dispatch_signal_ = signal; + } + hsa_signal_t& GetDispatchSignal() { + return dispatch_signal_; + } + void SetOrigSignal(const 
hsa_signal_t &signal) { + orig_signal_ = signal; + } + const hsa_signal_t& GetOrigSignal() const { + return orig_signal_; + } + rocprofiler_dispatch_record_t* GetRecord() { + return &record_; + } + + // Concurrent profiling mode + static bool k_concurrent_; + private: Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) @@ -309,12 +330,16 @@ class Context { metrics_(NULL), handler_(handler), handler_arg_(handler_arg), - pcsmp_mode_(false) + pcsmp_mode_(false), + dispatch_signal_{}, + orig_signal_{}, + record_{} {} ~Context() { Destruct(); } void Destruct() { + hsa_signal_destroy(dispatch_signal_); for (const auto& v : info_map_) { const std::string& name = v.first; const rocprofiler_feature_t* info = v.second; @@ -349,12 +374,20 @@ class Context { set_[group_index].ResetRefsCount(); const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { + // Handler for read packet completion + if (k_concurrent_) { + hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, HandlerRead, + &set_[group_index]); + } // Handler for stop packet completion hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, &set_[group_index]); } } } + + hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed"); } // Initialize rocprofiler context @@ -437,23 +470,6 @@ class Context { const uint32_t group_index = block_status.group_index; set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); } - } else if (kind & ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features - info->kind = ROCPROFILER_FEATURE_KIND_TRACE; - - const event_t* event = NULL; - if (kind & ROCPROFILER_FEATURE_KIND_PCSMP_MOD) { // PC sampling - pcsmp_mode_ = true; - } else if (kind & 
ROCPROFILER_FEATURE_KIND_SPM_MOD) { // SPM trace - const Metric* metric = metrics_->Get(name); - if (metric == NULL) - EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); - counters_vec_t counters_vec = metric->GetCounters(); - if (counters_vec.size() != 1) - EXC_RAISING(HSA_STATUS_ERROR, "trace bad metric '" << name << "' is not base counter"); - const counter_t* counter = counters_vec[0]; - event = &(counter->event); - } - set_[0].Insert(profile_info_t{event, info->parameters, info->parameter_count, info}); } else { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -464,7 +480,7 @@ class Context { void Finalize() { for (unsigned index = 0; index < set_.size(); ++index) { - const hsa_status_t status = set_[index].Finalize(); + const hsa_status_t status = set_[index].Finalize(k_concurrent_); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed"); } } @@ -502,56 +518,6 @@ class Context { if (ainfo_data->sample_id == 0) rinfo->data.result_int64 = 0; rinfo->data.result_int64 += ainfo_data->pmc_data.result; rinfo->data.kind = ROCPROFILER_DATA_KIND_INT64; - } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { - if (rinfo->data.result_bytes.copy) { - const bool trace_local = TraceProfile::IsLocal(); - util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); - if (sample_id == 0) { - const uint32_t output_buffer_size = profile->output_buffer.size; - const uint32_t output_buffer_size64 = profile->output_buffer.size / sizeof(uint64_t); - const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent); - void* ptr = (trace_local) ? 
hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : - calloc(output_buffer_size64, sizeof(uint64_t)); - rinfo->data.result_bytes.size = output_buffer_size; - rinfo->data.result_bytes.ptr = ptr; - callback_data->ptr = reinterpret_cast(ptr); - } - char* result_bytes_ptr = reinterpret_cast(rinfo->data.result_bytes.ptr); - const char* end = result_bytes_ptr + rinfo->data.result_bytes.size; - const char* src = reinterpret_cast(ainfo_data->trace_data.ptr); - uint32_t size = ainfo_data->trace_data.size; - char* ptr = callback_data->ptr; - uint32_t* header = reinterpret_cast(ptr); - char* dest = ptr + sizeof(*header); - - if ((dest + size) >= end) { - if (dest < end) size = end - dest; - else EXC_RAISING(HSA_STATUS_ERROR, "Trace data out of output buffer"); - } - - bool suc = true; - if (trace_local) { - suc = hsa_rsrc->Memcpy(profile->agent, dest, src, size); - } else { - memcpy(dest, src, size); - } - if (suc) { - *header = size; - callback_data->ptr = dest + align_size(size, sizeof(uint32_t)); - rinfo->data.result_bytes.instance_count = sample_id + 1; - rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; - } else - EXC_RAISING(HSA_STATUS_ERROR, "Agent Memcpy failed, dst(" << (void*)dest << ") src(" << (void*)src << ") size(" << size << ")"); - } else { - if (sample_id == 0) { - rinfo->data.result_bytes.ptr = profile->output_buffer.ptr; - rinfo->data.result_bytes.size = profile->output_buffer.size; - rinfo->data.result_bytes.instance_count = UINT32_MAX; - } - - rinfo->data.result_bytes.instance_count += 1; - rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; - } } else { EXC_RAISING(HSA_STATUS_ERROR, "unknown data type = " << ainfo_type); } @@ -593,8 +559,17 @@ class Context { // PC sampling mode bool pcsmp_mode_; + + // kernel packet dispatch copmletion signal + hsa_signal_t dispatch_signal_; + hsa_signal_t orig_signal_; + rocprofiler_dispatch_record_t record_; + }; +#define CONTEXT_INSTANTIATE() \ + bool rocprofiler::Context::k_concurrent_ = false; + } // namespace 
rocprofiler #endif // SRC_CORE_CONTEXT_H_ diff --git a/src/core/hsa_interceptor.h b/src/core/hsa_interceptor.h index f1d8a0d8..9207730b 100644 --- a/src/core/hsa_interceptor.h +++ b/src/core/hsa_interceptor.h @@ -25,6 +25,7 @@ SOFTWARE. #ifndef _SRC_CORE_HSA_INTERCEPTOR_H #define _SRC_CORE_HSA_INTERCEPTOR_H +#include #include #include #include @@ -49,7 +50,8 @@ SOFTWARE. (ID == ROCPROFILER_HSA_CB_ID_ALLOCATE) ? callbacks_.allocate: \ (ID == ROCPROFILER_HSA_CB_ID_DEVICE) ? callbacks_.device: \ (ID == ROCPROFILER_HSA_CB_ID_MEMCOPY) ? callbacks_.memcopy: \ - callbacks_.submit; \ + (ID == ROCPROFILER_HSA_CB_ID_SUBMIT) ? callbacks_.submit: \ + callbacks_.ksymbol; \ if ((__callback != NULL) && (recursion_ == false)) #define DO_HSA_CALLBACK \ @@ -62,6 +64,14 @@ SOFTWARE. #define ISSUE_HSA_CALLBACK(ID) \ do { IS_HSA_CALLBACK(ID) { DO_HSA_CALLBACK; } } while(0) +// Demangle C++ symbol name +static const char* cpp_demangle(const char* symname) { + size_t size = 0; + int status; + const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); + return (ret != 0) ? 
ret : strdup(symname); +} + namespace rocprofiler { extern decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; extern decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; @@ -337,6 +347,39 @@ class HsaInterceptor { return HSA_STATUS_SUCCESS; } + static hsa_status_t KernelSymbolCallback( + hsa_executable_t executable, + hsa_executable_symbol_t symbol, + void *arg) + { + const int free_flag = reinterpret_cast(arg); + hsa_symbol_kind_t kind = (hsa_symbol_kind_t)0; + HSA_RT(hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &kind)); + + if (kind == HSA_SYMBOL_KIND_KERNEL) { + const char* name = NULL; + uint32_t len = 0; + uint64_t obj = 0; + HSA_RT(hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &obj)); + if (free_flag == 0) { + HSA_RT(hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len)); + char sym_name[len + 1]; + HSA_RT(hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, sym_name)); + name = cpp_demangle(sym_name); + } + + rocprofiler_hsa_callback_data_t data{}; + data.ksymbol.object = obj; + data.ksymbol.name = name; + data.ksymbol.name_length = len; + data.ksymbol.destroy = free_flag; + + ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL); + } + + return HSA_STATUS_SUCCESS; + } + static hsa_status_t ExecutableFreeze( hsa_executable_t executable, const char *options) @@ -352,6 +395,15 @@ class HsaInterceptor { reinterpret_cast(0)); } + { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL) { + HSA_RT(hsa_executable_iterate_symbols( + executable, + KernelSymbolCallback, + reinterpret_cast(0))); + } + } + return status; } @@ -367,6 +419,15 @@ class HsaInterceptor { reinterpret_cast(1)); } + { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL) { + HSA_RT(hsa_executable_iterate_symbols( + executable, + KernelSymbolCallback, + reinterpret_cast(1))); + } + } + HSA_RT(hsa_executable_destroy_fn(executable)); return status; diff --git 
a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp index 0b309d63..705fff29 100644 --- a/src/core/intercept_queue.cpp +++ b/src/core/intercept_queue.cpp @@ -42,4 +42,7 @@ InterceptQueue::queue_id_t InterceptQueue::current_queue_id = 0; rocprofiler_hsa_callback_fun_t InterceptQueue::submit_callback_fun_ = NULL; void* InterceptQueue::submit_callback_arg_ = NULL; +bool InterceptQueue::opt_mode_ = false; +uint32_t InterceptQueue::k_concurrent_ = K_CONC_OFF; +std::once_flag InterceptQueue::once_flag_; } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index a52d8c1d..5cd09b10 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -24,7 +24,6 @@ THE SOFTWARE. #define _SRC_CORE_INTERCEPT_QUEUE_H #include -#include #include #include @@ -41,9 +40,28 @@ THE SOFTWARE. #include "util/hsa_rsrc_factory.h" namespace rocprofiler { +enum { + K_CONC_OFF = 0, + K_CONC_PMC = 1, + K_CONC_TRACE = 2 +}; + extern decltype(hsa_queue_create)* hsa_queue_create_fn; extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; +void PmcStarter(Context* context); + +static std::mutex ctx_a_mutex; +typedef std::map ctx_a_map_t; +static ctx_a_map_t* ctx_a_map = NULL; +static bool ck_ctx_inactive(Context* context) { + std::lock_guard lock(ctx_a_mutex); + if (ctx_a_map == NULL) ctx_a_map = new ctx_a_map_t; + auto ret = ctx_a_map->insert({context, true}); + if (ret.second == false) ctx_a_map->erase(context); + return ret.second; +} + class InterceptQueue { public: typedef std::recursive_mutex mutex_t; @@ -79,7 +97,13 @@ class InterceptQueue { if (!obj_map_) obj_map_ = new obj_map_t; InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy); (*obj_map_)[(uint64_t)(*queue)] = obj; - status = proxy->SetInterceptCB(OnSubmitCB, obj); + if (k_concurrent_ == K_CONC_TRACE) { + status = proxy->SetInterceptCB(OnSubmitCB_ctrace, obj); + } else if (opt_mode_) { + status = proxy->SetInterceptCB(OnSubmitCB_opt, obj); + } else { + status 
= proxy->SetInterceptCB(OnSubmitCB, obj); + } obj->queue_event_callback_ = callback; obj->queue_id = current_queue_id; ++current_queue_id; @@ -123,6 +147,77 @@ class InterceptQueue { return status; } + static void OnSubmitCB_opt(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, + hsa_amd_queue_intercept_packet_writer writer) { + const packet_t* packets_arr = reinterpret_cast(in_packets); + InterceptQueue* obj = reinterpret_cast(data); + Queue* proxy = obj->proxy_; + + // Travers input packets + for (uint64_t j = 0; j < count; ++j) { + const packet_t* packet = &packets_arr[j]; + bool to_submit = true; + + // Checking for dispatch packet type + if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && + (dispatch_callback_.load(std::memory_order_acquire) != NULL)) { + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + const hsa_signal_t completion_signal = dispatch_packet->completion_signal; + + rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, + obj->agent_info_->dev_index, + obj->queue_, + user_que_idx, + obj->queue_id, + completion_signal, + dispatch_packet, + NULL, // kernel_name + 0, // kernel_object + NULL, // kernel_code + 0, // (uint32_t)syscall(__NR_gettid), + NULL}; // record + + // Calling dispatch callback + rocprofiler_group_t group = {}; + hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); + Context* context = reinterpret_cast(group.context); + // Injecting profiling start/stop packets + if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) { + if (group.feature_count != 0) { + if (tracker_ != NULL) { + const_cast(dispatch_packet)->completion_signal = context->GetDispatchSignal(); + Group* context_group = context->GetGroup(group.index); + Tracker::Enable_opt(context_group, completion_signal); + context_group->IncrRefsCount(); + } + + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& stop_vector = 
context->StopPackets(group.index); + pkt_vector_t packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; + } + } + } + + // Submitting the original packets if profiling was not enabled + if (to_submit) { + if (writer != NULL) { + writer(packet, 1); + } else { + proxy->Submit(packet, 1); + } + } + } + } + static void OnSubmitCB(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, hsa_amd_queue_intercept_packet_writer writer) { const packet_t* packets_arr = reinterpret_cast(in_packets); @@ -202,7 +297,6 @@ class InterceptQueue { // Calling dispatch callback rocprofiler_group_t group = {}; hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); - free(const_cast(kernel_name)); // Injecting profiling start/stop packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { if (tracker_entry != NULL) { @@ -221,9 +315,27 @@ class InterceptQueue { const pkt_vector_t& start_vector = context->StartPackets(group.index); const pkt_vector_t& stop_vector = context->StopPackets(group.index); - pkt_vector_t packets = start_vector; - packets.insert(packets.end(), *packet); - packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + const pkt_vector_t& read_vector = context->ReadPackets(group.index); + pkt_vector_t packets; + + if (k_concurrent_ == K_CONC_OFF) { // serial + packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + } else { // concurrent + // Atrt PMC once + std::call_once(once_flag_, PmcStarter, context); + // Reads at both kernel start and end + assert(read_vector.size() == 2 * start_vector.size()); + auto mid = read_vector.begin() + read_vector.size()/2; + // Read at kernel start + 
packets.insert(packets.end(), read_vector.begin(), mid); + // Kernel dispatch packet + packets.insert(packets.end(), *packet); + // Read at kernel end + packets.insert(packets.end(), mid, read_vector.end()); + } + if (writer != NULL) { writer(&packets[0], packets.size()); } else { @@ -251,6 +363,110 @@ class InterceptQueue { } } + static void OnSubmitCB_ctrace(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, + hsa_amd_queue_intercept_packet_writer writer) { + const packet_t* packets_arr = reinterpret_cast(in_packets); + InterceptQueue* obj = reinterpret_cast(data); + Queue* proxy = obj->proxy_; + + if (submit_callback_fun_) { + mutex_.lock(); + auto* callback_fun = submit_callback_fun_; + void* callback_arg = submit_callback_arg_; + mutex_.unlock(); + + if (callback_fun) { + for (uint64_t j = 0; j < count; ++j) { + const packet_t* packet = &packets_arr[j]; + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + + const char* kernel_name = NULL; + if (GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) { + uint64_t kernel_object = dispatch_packet->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + kernel_name = (GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) ? 
+ QueryKernelName(kernel_object, kernel_code) : NULL; + } + + // Prepareing submit callback data + rocprofiler_hsa_callback_data_t data{}; + data.submit.packet = (void*)packet; + data.submit.kernel_name = kernel_name; + data.submit.queue = obj->queue_; + data.submit.device_type = obj->agent_info_->dev_type; + data.submit.device_id = obj->agent_info_->dev_index; + + callback_fun(ROCPROFILER_HSA_CB_ID_SUBMIT, &data, callback_arg); + } + } + } + + // Travers input packets + for (uint64_t j = 0; j < count; ++j) { + const packet_t* packet = &packets_arr[j]; + bool to_submit = true; + + // Checking for dispatch packet type + if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && + (dispatch_callback_.load(std::memory_order_acquire) != NULL)) { + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + const hsa_signal_t completion_signal = dispatch_packet->completion_signal; + + // Prepareing dispatch callback data + uint64_t kernel_object = dispatch_packet->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + const char* kernel_name = QueryKernelName(kernel_object, kernel_code); + + rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, + obj->agent_info_->dev_index, + obj->queue_, + user_que_idx, + obj->queue_id, + completion_signal, + dispatch_packet, + kernel_name, + kernel_object, + kernel_code, + (uint32_t)syscall(__NR_gettid), + NULL}; + + // Calling dispatch callback + rocprofiler_group_t group = {}; + hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); + + // Injecting profiling start/stop packets + if ((status == HSA_STATUS_SUCCESS) && (group.context != NULL)) { + Context* context = reinterpret_cast(group.context); + const bool ctx_inactive = ck_ctx_inactive(context); + + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& stop_vector = context->StopPackets(group.index); + pkt_vector_t packets; + if 
(ctx_inactive) packets = start_vector; + packets.insert(packets.end(), *packet); + if (!ctx_inactive) packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; + } + } + + // Submitting the original packets if profiling was not enabled + if (to_submit) { + if (writer != NULL) { + writer(packet, 1); + } else { + proxy->Submit(packet, 1); + } + } + } + } + static void SetCallbacks(rocprofiler_queue_callbacks_t callbacks, void* data) { std::lock_guard lck(mutex_); if (callback_data_ != NULL) { @@ -279,6 +495,9 @@ class InterceptQueue { static void TrackerOn(bool on) { tracker_on_ = on; } static bool IsTrackerOn() { return tracker_on_; } + static bool opt_mode_; + static uint32_t k_concurrent_; + private: static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) { if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported"); @@ -309,14 +528,6 @@ class InterceptQueue { return (dbg_info != NULL) ? dbg_info->kernel_name : NULL; } - // Demangle C++ symbol name - static const char* cpp_demangle(const char* symname) { - size_t size = 0; - int status; - const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); - return (ret != 0) ? ret : strdup(symname); - } - static const char* QueryKernelName(uint64_t kernel_object, const amd_kernel_code_t* kernel_code) { const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1); if (kernel_object_flag == 0) { @@ -327,7 +538,7 @@ class InterceptQueue { const char* kernel_symname = (util::HsaRsrcFactory::IsExecutableTracking()) ? 
util::HsaRsrcFactory::GetKernelNameRef(kernel_object) : GetKernelName(kernel_code->runtime_loader_kernel_symbol); - return cpp_demangle(kernel_symname); + return kernel_symname; } // method to get an intercept queue object @@ -391,6 +602,8 @@ class InterceptQueue { const util::AgentInfo* agent_info_; queue_event_callback_t queue_event_callback_; queue_id_t queue_id; + + static std::once_flag once_flag_; }; } // namespace rocprofiler diff --git a/src/core/profile.h b/src/core/profile.h index 9ed03375..f6165d07 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -119,7 +119,34 @@ class Profile { virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); } - hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, pkt_vector_t& read_vector) { + void SetConcurrent(profile_t* profile) { + // Check whether conconcurrent has been set + for (const parameter_t* p = profile->parameters; + p < (profile->parameters + profile->parameter_count); ++p) { + // If yes, stop here + if (p->parameter_name == HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT) { + return; + } + } + + // Otherwise, try to set + parameter_t* parameters = new parameter_t[profile->parameter_count+1]; + for (unsigned i = 0; i < profile->parameter_count; ++i) { + parameters[i].parameter_name = profile->parameters[i].parameter_name; + parameters[i].value = profile->parameters[i].value; + } + if (profile->parameters) free(const_cast(profile->parameters)); + parameters[profile->parameter_count].parameter_name = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT; + parameters[profile->parameter_count].value = 1; + profile->parameters = parameters; + profile->parameter_count += 1; + } + + hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, + pkt_vector_t& read_vector, bool is_concurrent = false) { + if (is_concurrent) SetConcurrent(&profile_); + hsa_status_t status = HSA_STATUS_SUCCESS; if (!info_vector_.empty()) { @@ -127,11 +154,14 
@@ class Profile { const pfn_t* api = rsrc->AqlProfileApi(); packet_t start{}; packet_t stop{}; - packet_t read{}; + packet_t read{}; // read at kernel start + packet_t read2{}; // read at kernel end // Check the profile buffer sizes status = api->hsa_ven_amd_aqlprofile_start(&profile_, NULL); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start(NULL)"); + // Double output buffer size if concurrent + if (is_concurrent) profile_.output_buffer.size *= 2; status = Allocate(rsrc); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "Allocate()"); @@ -144,21 +174,28 @@ class Profile { #ifdef AQLPROF_NEW_API if (profile_.type == HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC) { rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); + if (is_concurrent){ // concurrent: one more read + if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); + rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read2); + } } #if 0 // Read API returns error if disabled if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); #endif #endif - // Set completion signal + // Set completion signal of start hsa_signal_t dummy_signal{}; dummy_signal.handle = 0; start.completion_signal = dummy_signal; + + // Set completion signal of read/stop hsa_signal_t post_signal; status = hsa_signal_create(1, 0, NULL, &post_signal); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); stop.completion_signal = post_signal; read.completion_signal = post_signal; + read2.completion_signal = post_signal; completion_signal_ = post_signal; // Fill packet vectors @@ -180,18 +217,24 @@ class Profile { AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); if (rd_status == HSA_STATUS_SUCCESS) { - const uint32_t read_index = read_vector.size(); - read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); - status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4( - &read, 
reinterpret_cast(&read_vector[read_index])); - if (status != HSA_STATUS_SUCCESS) - AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + pkt_vector_t reads = {read}; + if (is_concurrent) reads.push_back(read2); + for (auto rd : reads) { + const uint32_t read_index = read_vector.size(); + read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); + status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4( + &rd, reinterpret_cast(&read_vector[read_index])); + if (status != HSA_STATUS_SUCCESS) + AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + } } } else { start_vector.push_back(start); stop_vector.push_back(stop); if (rd_status == HSA_STATUS_SUCCESS) { read_vector.push_back(read); + if (is_concurrent) + read_vector.push_back(read2); } } } @@ -237,46 +280,6 @@ class PmcProfile : public Profile { } }; -class TraceProfile : public Profile { - public: - static inline void SetSize(const uint32_t& size) { output_buffer_size_ = size; } - static inline uint32_t GetSize() { return output_buffer_size_; } - static inline void SetLocal(const bool& b) { output_buffer_local_ = b; } - static inline bool IsLocal() { return output_buffer_local_; } - - TraceProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { - profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE; - } - - void Insert(const profile_info_t& info) { - if (info.parameters != NULL) { - Profile::Insert(info); - for (unsigned j = 0; j < info.parameter_count; ++j) { - Config(&profile_).Insert(info.parameters[j]); - } - } else if (info.event != NULL) { - Config(&profile_).Insert(*(info.event)); - } else { - EXC_ABORT(HSA_STATUS_ERROR, "invalid trace info inserted"); - } - } - - hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { - profile_.command_buffer.ptr = - rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); - profile_.output_buffer.size = output_buffer_size_; - profile_.output_buffer.ptr = (output_buffer_local_) ? 
- rsrc->AllocateLocalMemory(agent_info_, profile_.output_buffer.size) : - rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); - return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? HSA_STATUS_SUCCESS - : HSA_STATUS_ERROR; - } - - private: - static uint32_t output_buffer_size_; - static bool output_buffer_local_; -}; - } // namespace rocprofiler #endif // SRC_CORE_PROFILE_H_ diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 618edf23..e53d7257 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -150,6 +150,20 @@ void RestoreHsaApi() { table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; } +void PmcStarter(Context* context) { + hsa_agent_t agent = context->GetAgent(); + // Create queue + hsa_queue_t* queue; + hsa_status_t status = rocprofiler::CreateQueuePro(agent, 1, + HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro"); + HsaQueue hsa_queue(NULL, queue); + context->Start(0, &hsa_queue); + context->Read(0, &hsa_queue); + context->GetData(0); + hsa_queue_destroy(queue); +} + void StandaloneIntercept() { ::HsaApiTable* table = kHsaApiTable; table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro; @@ -199,8 +213,6 @@ uint32_t LoadTool() { rocprofiler_settings_t settings{}; settings.intercept_mode = (intercept_mode != 0) ? 1 : 0; - settings.trace_size = TraceProfile::GetSize(); - settings.trace_local = TraceProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; settings.code_obj_tracking = 1; @@ -208,14 +220,17 @@ uint32_t LoadTool() { if (handler) handler(); else if (handler_prop) handler_prop(&settings); - TraceProfile::SetSize(settings.trace_size); - TraceProfile::SetLocal(settings.trace_local != 0); util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE; if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE; if (settings.hsa_intercepting) intercept_mode |= HSA_INTERCEPT_MODE; + if (settings.k_concurrent) { + Context::k_concurrent_ = settings.k_concurrent; + InterceptQueue::k_concurrent_ = settings.k_concurrent; + } + if (settings.opt_mode) InterceptQueue::opt_mode_ = true; } ONLOAD_TRACE("end intercept_mode(" << intercept_mode << ")"); @@ -418,8 +433,6 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( } rocprofiler_properties_t rocprofiler_properties; -uint32_t TraceProfile::output_buffer_size_ = 0x2000000; // 32M -bool TraceProfile::output_buffer_local_ = true; std::atomic Tracker::instance_{}; Tracker::mutex_t Tracker::glob_mutex_; Tracker::counter_t Tracker::counter_ = 0; @@ -427,6 +440,8 @@ util::Logger::mutex_t util::Logger::mutex_; std::atomic util::Logger::instance_{}; } +CONTEXT_INSTANTIATE(); + /////////////////////////////////////////////////////////////////////////////////////////////////// // Public library methods // @@ -536,8 +551,7 @@ PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_ if (mode != 0) { if (mode & ROCPROFILER_MODE_STANDALONE) { if (mode & ROCPROFILER_MODE_CREATEQUEUE) { - if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == - false) { + if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == false) { EXC_RAISING(HSA_STATUS_ERROR, "CreateQueue() failed"); } } 
@@ -591,7 +605,7 @@ PUBLIC_API hsa_status_t rocprofiler_get_group(rocprofiler_t* handle, uint32_t gr rocprofiler_group_t* group) { API_METHOD_PREFIX rocprofiler::Context* context = reinterpret_cast(handle); - *group = context->GetGroupInfo(group_index); + *group = context->GetGroupDescr(group_index); API_METHOD_SUFFIX } @@ -692,12 +706,7 @@ PUBLIC_API hsa_status_t rocprofiler_stop_queue_callbacks() { // Method for iterating the events output data PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data( - rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) { - API_METHOD_PREFIX - rocprofiler::Context* context = reinterpret_cast(handle); - context->IterateTraceData(callback, data); - API_METHOD_SUFFIX -} + rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) {} //////////////////////////////////////////////////////////////////////////////// // Open profiling pool diff --git a/src/core/tracker.h b/src/core/tracker.h index 823dc17d..d538aff7 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -155,6 +155,49 @@ class Tracker { Enable(entry, reinterpret_cast(handler), arg); } + // Enable tracking + static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) { + Context* context = group->GetContext(); + context->SetOrigSignal(orig_signal); + context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); + + // Creating a proxy signal + const hsa_signal_value_t signal_value = (orig_signal.handle) ? 
+ util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1; + hsa_signal_t& dispatch_signal = context->GetDispatchSignal(); + util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value); + hsa_status_t status = + util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler_opt, group); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } + + // Tracker handler + static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) { + Group* group = reinterpret_cast(arg); + Context* context = group->GetContext(); + hsa_signal_t dispatch_signal = context->GetDispatchSignal(); + record_t* record = context->GetRecord(); + hsa_amd_profiling_dispatch_time_t dispatch_time{}; + hsa_status_t status = + util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); + record->begin = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.start); + record->end = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.end); + record->complete = util::HsaRsrcFactory::Instance().TimestampNs(); + + // Original intercepted signal completion + const hsa_signal_t& orig_signal = context->GetOrigSignal(); + if (orig_signal.handle) { + amd_signal_t* orig_signal_ptr = reinterpret_cast(orig_signal.handle); + amd_signal_t* prof_signal_ptr = reinterpret_cast(dispatch_signal.handle); + orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; + orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; + util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(orig_signal, signal_value); + } + + return Context::Handler(signal_value, arg); + } + private: Tracker() : outstanding_(0), diff --git a/src/util/hsa_rsrc_factory.cpp 
b/src/util/hsa_rsrc_factory.cpp index 78833284..e2f97ce9 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -36,6 +36,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -626,6 +627,8 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); + close(file_handle); + // Update output parameter *code_desc = kernelSymbol; return true; @@ -705,7 +708,7 @@ const char* HsaRsrcFactory::GetKernelNameRef(uint64_t addr) { std::lock_guard lck(mutex_); const auto it = symbols_map_->find(addr); if (it == symbols_map_->end()) { - fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx) is not found\n", addr); abort(); } return it->second; diff --git a/test/app/standalone_test.cpp b/test/app/standalone_test.cpp index 34bc05ea..7758daf2 100644 --- a/test/app/standalone_test.cpp +++ b/test/app/standalone_test.cpp @@ -146,19 +146,6 @@ int main() { // feature[8].name = "TCC_EA_WRREQ_sum"; // feature[9].kind = ROCPROFILER_FEATURE_KIND_METRIC; // feature[9].name = "TCC_EA_WRREQ_64B_sum"; -#if 0 - // Tracing parameters - const unsigned parameter_count = 2; - rocprofiler_parameter_t parameters[parameter_count]; - feature[2].name = "THREAD_TRACE"; - feature[2].kind = ROCPROFILER_FEATURE_KIND_TRACE; - feature[2].parameters = parameters; - feature[2].parameter_count = parameter_count; - parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; - parameters[0].value = 0; - parameters[1].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; - parameters[1].value = 0; -#endif // Instantiate HSA resources HsaRsrcFactory::Create(); diff --git a/test/run.sh b/test/run.sh index 4c985d3e..8611d7ef 100755 --- a/test/run.sh +++ b/test/run.sh @@ -86,7 +86,7 @@ export HSA_TOOLS_LIB=librocprofiler64.so.1 # enable 
intercepting mode in rocprofiler export ROCP_HSA_INTERCEPT=2 # test macro for kernel iterations number -export ROCP_KITER=100 +export ROCP_KITER=20 # test macro for per-kernel dispatching number export ROCP_DITER=10 eval_test "Standalone intercepting test" ./test/stand_intercept_test @@ -95,8 +95,8 @@ unset ROCP_HSA_INTERCEPT ## Intercepting usage model test # tool library loaded by ROC profiler export ROCP_TOOL_LIB=./test/libintercept_test.so -export ROCP_KITER=50 -export ROCP_DITER=50 +export ROCP_KITER=20 +export ROCP_DITER=20 export ROCP_AGENTS=1 export ROCP_THRS=3 eval_test "Intercepting usage model test" ./test/ctrl @@ -107,59 +107,56 @@ export ROCP_TOOL_LIB=libtool.so # ROC profiler kernels timing export ROCP_TIMESTAMP_ON=1 # output directory for the tool library, for metrics results file 'results.txt' -# and SQTT trace files 'thread_trace.se.out' export ROCP_OUTPUT_DIR=./RESULTS if [ ! -e $ROCP_TOOL_LIB ] ; then export ROCP_TOOL_LIB=test/libtool.so fi -export ROCP_KITER=50 -export ROCP_DITER=50 +export ROCP_KITER=20 +export ROCP_DITER=20 export ROCP_AGENTS=1 export ROCP_THRS=1 -export ROCP_INPUT=input.xml -eval_test "'rocprof' libtool test" ./test/ctrl +export ROCP_INPUT=pmc_input.xml +eval_test "'rocprof' libtool PMC test" ./test/ctrl -export ROCP_KITER=10 -export ROCP_DITER=10 +export ROCP_KITER=20 +export ROCP_DITER=20 export ROCP_AGENTS=1 export ROCP_THRS=10 -export ROCP_INPUT=input1.xml -eval_test "'rocprof' libtool test n-threads" ./test/ctrl - -## SPM test -# export ROCP_KITER=3 -# export ROCP_DITER=3 -# export ROCP_AGENTS=1 -# export ROCP_THRS=1 -# export ROCP_INPUT=spm_input.xml -# export ROCP_SPM=1 -# eval_test "libtool test, SPM trace test" ./test/ctrl -# unset ROCP_SPM +export ROCP_INPUT=pmc_input.xml +eval_test "'rocprof' libtool PMC n-thread test" ./test/ctrl -## Libtool test, counter sets -# Memcopies tracking -export ROCP_MCOPY_TRACKING=1 +export ROCP_OPT_MODE=1 +export ROCP_KITER=20 +export ROCP_DITER=20 +export ROCP_AGENTS=1 +export 
ROCP_THRS=10 +export ROCP_INPUT=pmc_input.xml +eval_test "'rocprof' libtool PMC n-thread opt test" ./test/ctrl +unset ROCP_OPT_MODE -export ROCP_KITER=1 -export ROCP_DITER=4 -export ROCP_INPUT=input2.xml -eval_test "libtool test, counter sets" ./test/ctrl +export ROCP_KITER=20 +export ROCP_DITER=20 +export ROCP_AGENTS=1 +export ROCP_THRS=1 +export ROCP_INPUT=pmc_input1.xml +eval_test "'rocprof' libtool PMC test1" ./test/ctrl -## OpenCL test -#export ROCP_OBJ_TRACKING=1 -#export ROCP_INPUT=input1.xml -#eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution +export ROCP_KITER=20 +export ROCP_DITER=20 +export ROCP_AGENTS=1 +export ROCP_THRS=10 +export ROCP_INPUT=pmc_input1.xml +eval_test "'rocprof' libtool PMC n-thread test1" ./test/ctrl -# Memcopies tracking -unset ROCP_MCOPY_TRACKING +## Libtool test, counter sets # enable HSA intercepting export ROCP_HSA_INTERC=1 export ROCP_KITER=10 export ROCP_DITER=10 -export ROCP_INPUT=input1.xml +#export ROCP_INPUT=input1.xml eval_test "libtool test, counter sets" ./test/ctrl ## OpenCL test diff --git a/test/tool/pmc_input.xml b/test/tool/pmc_input.xml new file mode 100644 index 00000000..6b9e3d6a --- /dev/null +++ b/test/tool/pmc_input.xml @@ -0,0 +1,4 @@ +# List of metrics + diff --git a/test/tool/pmc_input1.xml b/test/tool/pmc_input1.xml new file mode 100644 index 00000000..6863fa29 --- /dev/null +++ b/test/tool/pmc_input1.xml @@ -0,0 +1,14 @@ +# Filter by dispatches range, GPU index and kernel names + + +# List of metrics + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index e216b7fd..6b2adf8a 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -27,6 +27,7 @@ THE SOFTWARE. 
/////////////////////////////////////////////////////////////////////////////// #include +#include #include #include #include @@ -99,6 +100,7 @@ struct context_entry_t { unsigned feature_count; rocprofiler_callback_data_t data; kernel_properties_t kernel_properties; + HsaRsrcFactory::symbols_map_it_t kernel_name_it; FILE* file_handle; }; @@ -143,8 +145,6 @@ static uint32_t CTX_OUTSTANDING_MON = 0; uint32_t to_truncate_names = 0; // local trace buffer bool is_trace_local = true; -// SPM trace enabled -bool is_spm_trace = false; static inline uint32_t GetPid() { return syscall(__NR_getpid); } static inline uint32_t GetTid() { return syscall(__NR_gettid); } @@ -169,6 +169,21 @@ void check_status(hsa_status_t status) { } } +////////////////////////////////////////////////////////////////////////////////////// +// Dispatch opt code ///////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////// +// Context callback arg +struct callbacks_arg_t { + rocprofiler_pool_t** pools; +}; + +// Handler callback arg +struct handler_arg_t { + rocprofiler_feature_t* features; + unsigned feature_count; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////// // Print profiling results output break if terminal output is enabled void results_output_break() { const bool is_terminal_output = (result_file_opened == false); @@ -289,54 +304,18 @@ void dealloc_context_entry(context_entry_t* entry) { } } -// Dump trace data to file -void dump_sqtt_trace(const char* label, const uint32_t chunk, const void* data, const uint32_t& size) { - if (result_prefix != NULL) { - // Open file - std::ostringstream oss; - oss << result_prefix << "/thread_trace_" << label << "_se" << chunk << ".out"; - FILE* file = fopen(oss.str().c_str(), "w"); - if (file == NULL) { - std::ostringstream errmsg; - errmsg << "fopen error, file '" << oss.str().c_str() << "'"; - 
perror(errmsg.str().c_str()); - abort(); - } - - // Write the buffer in terms of shorts (16 bits) - const unsigned short* ptr = reinterpret_cast(data); - for (uint32_t i = 0; i < (size / sizeof(short)); ++i) { - fprintf(file, "%04x\n", ptr[i]); - } - - // Close file - fclose(file); - } -} - -// Dump trace data to file -void dump_spm_trace(const char* label, const void* data, const uint32_t& size) { - if (result_prefix != NULL) { - // Open trace file - std::ostringstream oss; - oss << result_prefix << "/spm_trace_" << label << ".out"; - const int fd = open(oss.str().c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0666); - if (fd == -1) { - std::ostringstream errmsg; - errmsg << "open error, file '" << oss.str().c_str() << "'"; - perror(errmsg.str().c_str()); - abort(); - } - // write trace binary data - if (write(fd, data, size) == -1) { - std::ostringstream errmsg; - errmsg << "write error, file '" << oss.str().c_str() << "'"; - perror(errmsg.str().c_str()); - abort(); - } - // Close file - close(fd); - } +// Global context map +static std::mutex ctx_a_mutex; +typedef std::map ctx_a_map_t; +ctx_a_map_t* ctx_a_map = NULL; +context_entry_t* ck_ctx_entry(hsa_agent_t agent, bool& found) { + std::lock_guard lock(ctx_a_mutex); + if (ctx_a_map == NULL) ctx_a_map = new ctx_a_map_t; + auto ret = ctx_a_map->insert({agent.handle, NULL}); + found = !ret.second; + if (found) ctx_a_map->erase(agent.handle); + else ret.first->second = new context_entry_t{}; + return ret.first->second; } struct trace_data_arg_t { @@ -345,54 +324,6 @@ struct trace_data_arg_t { hsa_agent_t agent; }; -// Trace data callback for getting trace data from GPU local memory -hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, - hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { - hsa_status_t status = HSA_STATUS_SUCCESS; - trace_data_arg_t* arg = reinterpret_cast(data); - if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { - if (is_spm_trace) { - if (info_data->sample_id != 0) { - 
fatal("Only one SPM sample expected"); - } - const void* data_ptr = info_data->trace_data.ptr; - const uint32_t data_size = info_data->trace_data.size; - fprintf(arg->file, " size(%u)\n", data_size); - - if (is_trace_local == false) fatal("SPM trace supports only local trace allocation"); - HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); - const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); - const uint32_t mem_size = data_size; - void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); - if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { - fatal("Trace data memcopy to host failed"); - } - dump_spm_trace(arg->label, buffer, data_size); - HsaRsrcFactory::FreeMemory(buffer); - } else { - const void* data_ptr = info_data->trace_data.ptr; - const uint32_t data_size = info_data->trace_data.size; - fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); - - if (is_trace_local) { - HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); - const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); - const uint32_t mem_size = data_size; - void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); - if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { - fatal("Trace data memcopy to host failed"); - } - dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); - HsaRsrcFactory::FreeMemory(buffer); - } else { - dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); - } - } - } else - status = HSA_STATUS_ERROR; - return status; -} - // Align to specified alignment unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); @@ -413,38 +344,7 @@ void output_results(const context_entry_t* entry, const char* label) { case ROCPROFILER_DATA_KIND_INT64: fprintf(file, "(%lu)\n", p->data.result_int64); break; - // Output trace results - case ROCPROFILER_DATA_KIND_BYTES: { - if (p->data.result_bytes.copy) { - uint64_t size = 
0; - - const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); - const char* end = reinterpret_cast(ptr + p->data.result_bytes.size); - for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { - const uint32_t chunk_size = *reinterpret_cast(ptr); - const char* chunk_data = ptr + sizeof(uint32_t); - if (chunk_data >= end) fatal("Trace data is out of the result buffer size"); - - dump_sqtt_trace(label, i, chunk_data, chunk_size); - const uint32_t off = align_size(chunk_size, sizeof(uint32_t)); - ptr = chunk_data + off; - if (chunk_data >= end) fatal("Trace data ptr is out of the result buffer size"); - size += chunk_size; - } - fprintf(file, "size(%lu)\n", size); - HsaRsrcFactory::FreeMemory(p->data.result_bytes.ptr); - const_cast(p)->data.result_bytes.size = 0; - } else { - fprintf(file, "(\n"); - trace_data_arg_t trace_data_arg{file, label, entry->agent}; - hsa_status_t status = rocprofiler_iterate_trace_data(context, trace_data_cb, reinterpret_cast(&trace_data_arg)); - check_status(status); - fprintf(file, " )\n"); - } - break; - } default: - if (is_spm_trace) continue; fprintf(stderr, "RPL-tool: undefined data kind(%u)\n", p->data.kind); abort(); } @@ -465,7 +365,7 @@ void output_group(const context_entry_t* entry, const char* label) { } // Dump stored context entry -bool dump_context_entry(context_entry_t* entry) { +bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { hsa_status_t status = HSA_STATUS_ERROR; volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); @@ -481,35 +381,36 @@ bool dump_context_entry(context_entry_t* entry) { ++context_collected; const uint32_t index = entry->index; - FILE* file_handle = entry->file_handle; - const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); - const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); - - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", - index, - agent_info->dev_index, - entry->data.queue_id, - entry->data.queue_index, - my_pid, - entry->data.thread_id, - entry->kernel_properties.grid_size, - entry->kernel_properties.workgroup_size, - (entry->kernel_properties.lds_size + (AgentInfo::lds_block_size - 1)) & ~(AgentInfo::lds_block_size - 1), - entry->kernel_properties.scratch_size, - (entry->kernel_properties.vgpr_count + 1) * agent_info->vgpr_block_size, - (entry->kernel_properties.sgpr_count + agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, - entry->kernel_properties.fbarrier_count, - entry->kernel_properties.signal.handle, - nik_name.c_str()); - if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", - record->dispatch, - record->begin, - record->end, - record->complete); - fprintf(file_handle, "\n"); - fflush(file_handle); - - if (record) { + if (index != UINT32_MAX) { + FILE* file_handle = entry->file_handle; + const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); + const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); + + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", + index, + agent_info->dev_index, + entry->data.queue_id, + entry->data.queue_index, + my_pid, + entry->data.thread_id, + entry->kernel_properties.grid_size, + entry->kernel_properties.workgroup_size, + (entry->kernel_properties.lds_size + (AgentInfo::lds_block_size - 1)) & ~(AgentInfo::lds_block_size - 1), + entry->kernel_properties.scratch_size, + (entry->kernel_properties.vgpr_count + 1) * agent_info->vgpr_block_size, + (entry->kernel_properties.sgpr_count + agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, + entry->kernel_properties.fbarrier_count, + entry->kernel_properties.signal.handle, + nik_name.c_str()); + if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", + record->dispatch, + record->begin, + record->end, + record->complete); + fprintf(file_handle, "\n"); + fflush(file_handle); + } + if (record && to_clean) { delete record; entry->data.record = NULL; } @@ -527,11 +428,11 @@ bool dump_context_entry(context_entry_t* entry) { std::ostringstream oss; oss << index << "__" << filtr_kernel_name(entry->data.kernel_name); output_results(entry, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str()); - free(const_cast(entry->data.kernel_name)); + if (to_clean) free(const_cast(entry->data.kernel_name)); // Finishing cleanup // Deleting profiling context will delete all allocated resources - rocprofiler_close(group.context); + if (to_clean) rocprofiler_close(group.context); } return true; @@ -574,7 +475,6 @@ void dump_context_array(hsa_queue_t* queue) { // Profiling completion handler // Dump and delete the context entry -// Return true if the context was dumped successfully bool 
context_handler(rocprofiler_group_t group, void* arg) { context_entry_t* entry = reinterpret_cast(arg); @@ -606,6 +506,34 @@ bool context_handler(rocprofiler_group_t group, void* arg) { return false; } +// Profiling completion handler +// Dump context entry +bool context_pool_handler(const rocprofiler_pool_entry_t* entry, void* arg) { + // Context entry + context_entry_t* ctx_entry = reinterpret_cast(entry->payload); + handler_arg_t* handler_arg = reinterpret_cast(arg); + ctx_entry->features = handler_arg->features; + ctx_entry->feature_count = handler_arg->feature_count; + ctx_entry->data.kernel_name = ctx_entry->kernel_name_it->second.name; + ctx_entry->file_handle = result_file_handle; + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(ctx_entry, false); + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + HsaRsrcFactory::ReleaseKernelNameRef(ctx_entry->kernel_name_it); + + return false; +} + bool check_filter(const rocprofiler_callback_data_t* callback_data, const callbacks_data_t* tool_data) { bool found = true; @@ -641,12 +569,54 @@ bool check_filter(const rocprofiler_callback_data_t* callback_data, const callba return found; } +static const amd_kernel_code_t* GetKernelCode(uint64_t kernel_object) { + const amd_kernel_code_t* kernel_code = NULL; + hsa_status_t status = + HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( + reinterpret_cast(kernel_object), + reinterpret_cast(&kernel_code)); + if (HSA_STATUS_SUCCESS != status) { + kernel_code = reinterpret_cast(kernel_object); + } + return kernel_code; +} + +// Setting kernel properties +void set_kernel_properties(const rocprofiler_callback_data_t* callback_data, + context_entry_t* entry) +{ + const hsa_kernel_dispatch_packet_t* packet = callback_data->packet; + kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties); + const amd_kernel_code_t* kernel_code = 
callback_data->kernel_code; + + entry->data = *callback_data; + + if (kernel_code == NULL) { + const uint64_t kernel_object = callback_data->packet->kernel_object; + kernel_code = GetKernelCode(kernel_object); + entry->kernel_name_it = HsaRsrcFactory::AcquireKernelNameRef(kernel_object); + } else { + entry->data.kernel_name = strdup(callback_data->kernel_name); + } + + uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z; + if (grid_size > UINT32_MAX) abort(); + kernel_properties_ptr->grid_size = (uint32_t)grid_size; + uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; + if (workgroup_size > UINT32_MAX) abort(); + kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; + kernel_properties_ptr->lds_size = packet->group_segment_size; + kernel_properties_ptr->scratch_size = packet->private_segment_size; + kernel_properties_ptr->vgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); + kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); + kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; + kernel_properties_ptr->signal = callback_data->completion_signal; +} + // Kernel disoatch callback hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) { // Passed tool data - const hsa_kernel_dispatch_packet_t* packet = callback_data->packet; - const amd_kernel_code_t* kernel_code = callback_data->kernel_code; callbacks_data_t* tool_data = reinterpret_cast(user_data); // HSA status hsa_status_t status = HSA_STATUS_ERROR; @@ -659,23 +629,10 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, } } // Profiling context - rocprofiler_t* context = NULL; // Context entry context_entry_t* entry = 
alloc_context_entry(); - // kernel properties - kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties); - uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z; - if (grid_size > UINT32_MAX) abort(); - kernel_properties_ptr->grid_size = (uint32_t)grid_size; - uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; - if (workgroup_size > UINT32_MAX) abort(); - kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; - kernel_properties_ptr->lds_size = packet->group_segment_size; - kernel_properties_ptr->scratch_size = packet->private_segment_size; - kernel_properties_ptr->vgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); - kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); - kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; - kernel_properties_ptr->signal = callback_data->completion_signal; + // Setting kernel properties + set_kernel_properties(callback_data, entry); // context properties rocprofiler_properties_t properties{}; @@ -701,6 +658,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, } // Open profiling context + rocprofiler_t* context = NULL; status = rocprofiler_open(callback_data->agent, features, feature_count, &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); check_status(status); @@ -720,8 +678,6 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, entry->group = *group; entry->features = features; entry->feature_count = feature_count; - entry->data = *callback_data; - entry->data.kernel_name = strdup(callback_data->kernel_name); entry->file_handle = tool_data->file_handle; entry->active = true; reinterpret_cast*>(&entry->valid)->store(true); @@ -734,6 +690,35 @@ hsa_status_t 
dispatch_callback(const rocprofiler_callback_data_t* callback_data, return status; } +// Kernel disoatch callback +hsa_status_t dispatch_callback_opt(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + hsa_status_t status = HSA_STATUS_ERROR; + hsa_agent_t agent = callback_data->agent; + const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index; + callbacks_arg_t* callbacks_arg = reinterpret_cast(user_data); + rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id]; + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + check_status(status); + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast(pool_entry.payload); + // Setting kernel properties + set_kernel_properties(callback_data, entry); + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + // Fill profiling context entry + entry->index = UINT32_MAX; + entry->agent = agent; + entry->group = *group; + + reinterpret_cast*>(&entry->valid)->store(true); + return status; +} + hsa_status_t destroy_callback(hsa_queue_t* queue, void*) { results_output_break(); dump_context_array(queue); @@ -889,9 +874,19 @@ rocprofiler_hsa_callbacks_t hsa_callbacks { hsa_unified_callback, hsa_unified_callback, hsa_unified_callback, - hsa_unified_callback + hsa_unified_callback, + NULL }; +// HSA kernel symbol callback +hsa_status_t hsa_ksymbol_cb(rocprofiler_hsa_cb_id_t id, + const rocprofiler_hsa_callback_data_t* data, + void* arg) +{ + HsaRsrcFactory::SetKernelNameRef(data->ksymbol.object, data->ksymbol.name, data->ksymbol.destroy); + return HSA_STATUS_SUCCESS; +} + // Tool constructor extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { @@ -979,6 +974,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Set HSA intercepting check_env_var("ROCP_HSA_INTERC", 
settings->hsa_intercepting); if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14); + // Enable optmized mode + check_env_var("ROCP_OPT_MODE", settings->opt_mode); is_trace_local = settings->trace_local; @@ -1064,6 +1061,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) range_vec->push_back(*(range_vec->begin()) + 1); } + const bool filter_disabled = (gpu_index_vec->empty() && kernel_string_vec->empty() && range_vec->empty()); + // Getting traces const auto traces_list = xml->GetNodes("top.trace"); if (traces_list.size() > 1) fatal("ROCProfiler: only one trace supported at a time"); @@ -1087,26 +1086,79 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Context array aloocation context_array = new context_array_t; - // Adding dispatch observer - rocprofiler_queue_callbacks_t callbacks_ptrs{0}; - callbacks_ptrs.dispatch = dispatch_callback; - callbacks_ptrs.destroy = destroy_callback; - - callbacks_data = new callbacks_data_t{}; - callbacks_data->features = features; - callbacks_data->feature_count = features_found; - callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; - callbacks_data->group_index = 0; - callbacks_data->file_handle = result_file_handle; - callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; - callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec; - callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; - callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || - (callbacks_data->kernel_string != NULL) || - (callbacks_data->range != NULL) - ? 
1 : 0; - - rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); + bool opt_mode_cond = ((features_found != 0) && + (metrics_set->empty()) && + (traces_found == 0) && + (filter_disabled == true)); + if (settings->opt_mode == 0) opt_mode_cond = false; + if (!opt_mode_cond) settings->opt_mode = 0; + if (opt_mode_cond) { + // Handler arg + handler_arg_t* handler_arg = new handler_arg_t{}; + handler_arg->features = features; + handler_arg->feature_count = feature_count; + + // Context properties + rocprofiler_pool_properties_t properties{}; + properties.num_entries = (CTX_OUTSTANDING_MAX != 0) ? CTX_OUTSTANDING_MAX : 1000; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_pool_handler; + properties.handler_arg = handler_arg; + + // Available GPU agents + const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents(); + callbacks_arg_t* callbacks_arg = new callbacks_arg_t{}; + callbacks_arg->pools = new rocprofiler_pool_t* [gpu_count]; + for (unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++) { + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false) { + fprintf(stderr, "GetGpuAgentInfo failed\n"); + abort(); + } + + // Open profiling pool + rocprofiler_pool_t* pool = NULL; + hsa_status_t status = rocprofiler_pool_open(agent_info->dev_id, features, features_found, + &pool, 0, &properties); + check_status(status); + callbacks_arg->pools[gpu_id] = pool; + } + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{0}; + callbacks_ptrs.dispatch = dispatch_callback_opt; + callbacks_ptrs.destroy = destroy_callback; + + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg); + + rocprofiler_hsa_callbacks_t cs{}; + cs.ksymbol = hsa_ksymbol_cb; + rocprofiler_set_hsa_callbacks(cs, NULL); + settings->code_obj_tracking = 0; + settings->hsa_intercepting = 1; + } else { + // Adding dispatch observer + 
rocprofiler_queue_callbacks_t callbacks_ptrs{0}; + callbacks_ptrs.dispatch = dispatch_callback; + callbacks_ptrs.destroy = destroy_callback; + + callbacks_data = new callbacks_data_t{}; + callbacks_data->features = features; + callbacks_data->feature_count = features_found; + callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; + callbacks_data->group_index = 0; + callbacks_data->file_handle = result_file_handle; + callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; + callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec; + callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; + callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || + (callbacks_data->kernel_string != NULL) || + (callbacks_data->range != NULL) + ? 1 : 0; + + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); + } xml::Xml::Destroy(xml); diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 10f9fbc1..7d3301a3 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -24,6 +24,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "util/hsa_rsrc_factory.h" +#include #include #include #include @@ -36,6 +37,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -44,6 +46,14 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +// Demangle C++ symbol name +static const char* cpp_demangle(const char* symname) { + size_t size = 0; + int status; + const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); + return (ret != 0) ? 
ret : strdup(symname); +} + // Callback function to get available in the system agents hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { hsa_status_t status = HSA_STATUS_ERROR; @@ -192,6 +202,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_destroy = table->core_->hsa_executable_destroy_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; @@ -232,6 +243,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_destroy = hsa_executable_destroy; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; @@ -618,6 +630,8 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); + close(file_handle); + // Update output parameter *code_desc = kernelSymbol; return true; @@ -693,52 +707,57 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -const char* HsaRsrcFactory::GetKernelNameRef(uint64_t addr) { - std::lock_guard lck(mutex_); - const auto it = symbols_map_->find(addr); - if (it == 
symbols_map_->end()) { - fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); - abort(); - } - return it->second; -} - -void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { - std::lock_guard lck(mutex_); - executable_tracking_on_ = true; - table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; -} - -hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) { +hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *arg) { hsa_symbol_kind_t value = (hsa_symbol_kind_t)0; hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value); CHECK_STATUS("Error in getting symbol info", status); + if (value == HSA_SYMBOL_KIND_KERNEL) { uint64_t addr = 0; - uint32_t len = 0; status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr); CHECK_STATUS("Error in getting kernel object", status); - status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); - CHECK_STATUS("Error in getting name len", status); - char *name = new char[len + 1]; - status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); - CHECK_STATUS("Error in getting kernel name", status); - name[len] = 0; - auto ret = symbols_map_->insert({addr, name}); - if (ret.second == false) { - delete[] ret.first->second; - ret.first->second = name; + + const int to_free = reinterpret_cast(arg); + const char* name = NULL; + if (to_free == 0) { + uint32_t len = 0; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); + CHECK_STATUS("Error in getting name len", status); + char sym_name[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, sym_name); + CHECK_STATUS("Error in getting 
kernel name", status); + sym_name[len] = 0; + name = cpp_demangle(sym_name); } + + SetKernelNameRef(addr, name, to_free); } + return HSA_STATUS_SUCCESS; } hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) { std::lock_guard lck(mutex_); if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; - hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)0); CHECK_STATUS("Error in iterating executable symbols", status); - return hsa_api_.hsa_executable_freeze(executable, options);; + return hsa_api_.hsa_executable_freeze(executable, options); +} + +hsa_status_t HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable) { + std::lock_guard lck(mutex_); + if (symbols_map_ != NULL) { + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)1); + CHECK_STATUS("Error in iterating executable symbols", status); + } + return hsa_api_.hsa_executable_destroy(executable); +} + +void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { + std::lock_guard lck(mutex_); + executable_tracking_on_ = true; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; + table->core_->hsa_executable_destroy_fn = hsa_executable_destroy_interceptor; } std::atomic HsaRsrcFactory::instance_{}; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index e857813b..ca5a6e7a 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -95,6 +95,7 @@ struct hsa_pfn_t { decltype(hsa_executable_create_alt)* hsa_executable_create_alt; decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_destroy)* hsa_executable_destroy; 
decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; @@ -286,6 +287,13 @@ class HsaRsrcFactory { typedef std::recursive_mutex mutex_t; typedef HsaTimer::timestamp_t timestamp_t; + // Executables loading tracking + struct symbols_map_data_t { + const char* name; + uint64_t refs_count; + }; + typedef std::map symbols_map_t; + static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed); @@ -406,7 +414,88 @@ class HsaRsrcFactory { // Enable executables loading tracking static bool IsExecutableTracking() { return executable_tracking_on_; } static void EnableExecutableTracking(HsaApiTable* table); - static const char* GetKernelNameRef(uint64_t addr); + + typedef symbols_map_t::iterator symbols_map_it_t; + + static inline const char* GetKernelNameRef(const uint64_t& addr) { + if (symbols_map_ == NULL) { + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx), error\n", addr); + abort(); + } + + std::lock_guard lck(mutex_); + + const auto it = symbols_map_->find(addr); + if (it == symbols_map_->end()) { + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx) is not found\n", addr); + abort(); + } + + return it->second.name; + } + + static inline symbols_map_it_t AcquireKernelNameRef(const uint64_t& addr) { + if (symbols_map_ == NULL) { + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx), error\n", addr); + abort(); + } + + std::lock_guard lck(mutex_); + + const auto it = symbols_map_->find(addr); + if (it == symbols_map_->end()) { + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx) is not found\n", addr); + abort(); + } + + std::atomic* atomic_ptr = + reinterpret_cast*>(&(it->second.refs_count)); + atomic_ptr->fetch_add(1, std::memory_order_relaxed); 
+ + return it; + } + + static inline void ReleaseKernelNameRef(const symbols_map_it_t& it) { + std::atomic* atomic_ptr = + reinterpret_cast*>(&(it->second.refs_count)); + atomic_ptr->fetch_sub(1, std::memory_order_relaxed); + } + + static inline void SetKernelNameRef(const uint64_t& addr, const char* name, const int& free) { + if (symbols_map_ == NULL) { + std::lock_guard lck(mutex_); + if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; + } + + auto it = symbols_map_->find(addr); + if (it != symbols_map_->end()) { + while (1) { + while(it->second.refs_count != 0) sched_yield(); + mutex_.lock(); + if (it->second.refs_count == 0) break; + mutex_.unlock(); + } + } + + if (it != symbols_map_->end()) { + delete[] it->second.name; + if (free == 1) { + symbols_map_->erase(it); + } else { + fprintf(stderr, "HsaRsrcFactory::SetKernelNameRef: to set kernel addr (0x%lx) conflict\n", addr); + abort(); + } + } else { + if (free == 0) { + symbols_map_->insert({addr, symbols_map_data_t{name, 0}}); + } else { + fprintf(stderr, "HsaRsrcFactory::SetKernelNameRef: to free kernel addr (0x%lx) not found\n", addr); + abort(); + } + } + + mutex_.unlock(); + } // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); @@ -492,11 +581,10 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; - // Executables loading tracking - typedef std::map symbols_map_t; static symbols_map_t* symbols_map_; static bool executable_tracking_on_; static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t hsa_executable_destroy_interceptor(hsa_executable_t executable); static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); // HSA runtime API table From b9e5f11509988e7edf13485beea6b9b1dd785700 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 21 Aug 2020 10:51:18 -0500 Subject: [PATCH 125/153] merge fix --- src/core/context.h | 1 - 
src/core/rocprofiler.cpp | 4 +++- test/tool/tool.cpp | 2 -- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/core/context.h b/src/core/context.h index 8be3a9e8..3116e036 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -499,7 +499,6 @@ class Context { hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; callback_data_t* callback_data = reinterpret_cast(data); - const profile_t* profile = callback_data->profile; info_vector_t& info_vector = *(callback_data->info_vector); uint32_t index = callback_data->index; const uint32_t sample_id = ainfo_data->sample_id; diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index e53d7257..d5af91c2 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -706,7 +706,9 @@ PUBLIC_API hsa_status_t rocprofiler_stop_queue_callbacks() { // Method for iterating the events output data PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data( - rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) {} + rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) { + return HSA_STATUS_ERROR; +} //////////////////////////////////////////////////////////////////////////////// // Open profiling pool diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 6b2adf8a..30e35504 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -334,7 +334,6 @@ void output_results(const context_entry_t* entry, const char* label) { FILE* file = entry->file_handle; const rocprofiler_feature_t* features = entry->features; const unsigned feature_count = entry->feature_count; - rocprofiler_t* context = entry->group.context; for (unsigned i = 0; i < feature_count; ++i) { const rocprofiler_feature_t* p = &features[i]; @@ -1088,7 +1087,6 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) bool opt_mode_cond = ((features_found != 0) && (metrics_set->empty()) && - (traces_found == 
0) && (filter_disabled == true)); if (settings->opt_mode == 0) opt_mode_cond = false; if (!opt_mode_cond) settings->opt_mode = 0; From fea1fd598c092ec88163f054aa8924df950be351 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 21 Aug 2020 13:30:16 -0500 Subject: [PATCH 126/153] clang warinig fix --- src/core/context.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/core/context.h b/src/core/context.h index 3116e036..d23c93ba 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -330,7 +330,6 @@ class Context { metrics_(NULL), handler_(handler), handler_arg_(handler_arg), - pcsmp_mode_(false), dispatch_signal_{}, orig_signal_{}, record_{} @@ -556,9 +555,6 @@ class Context { rocprofiler_handler_t handler_; void* handler_arg_; - // PC sampling mode - bool pcsmp_mode_; - // kernel packet dispatch copmletion signal hsa_signal_t dispatch_signal_; hsa_signal_t orig_signal_; From a2e7bfad92755f55d7a53d5b1f31718e784ad924 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sun, 20 Sep 2020 22:30:02 -0500 Subject: [PATCH 127/153] 3.8 update --- bin/rpl_run.sh | 4 + bin/tblextr.py | 119 ++++++++++++++------- inc/rocprofiler.h | 2 + src/core/context.h | 196 ++++++++++++++++++++++++---------- src/core/intercept_queue.h | 48 +++++---- src/core/profile.h | 115 ++++++++++++++++++-- src/core/rocprofiler.cpp | 56 +++++++--- src/core/tracker.h | 39 ++++--- src/util/hsa_rsrc_factory.cpp | 15 ++- test/tool/tool.cpp | 13 ++- 10 files changed, 454 insertions(+), 153 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index e98561b4..f45b8312 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -183,6 +183,7 @@ usage() { echo " Supported time formats: " echo " --flush-rate - to enable trace flush rate (time period)" echo " Supported time formats: " + echo " --parallel-kernels - to enable cnocurrent kernels" echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:${HOME}:" @@ -439,6 +440,9 @@ while [ 1 ] ; do if [ "$2" = "off" ] ; then export ROCP_OBJ_TRACKING=0 fi + elif [ "$1" = "--parallel-kernels" ] ; then + ARG_VAL=0 + export ROCP_K_CONCURRENT=1 elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 diff --git a/bin/tblextr.py b/bin/tblextr.py index 60d99db3..1b39a415 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -240,11 +240,13 @@ def fill_kernel_db(table_name, db): ] def fill_ext_db(table_name, db, indir, trace_name, api_pid): file_name = indir + '/' + trace_name + '_trace.txt' - ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(.*)$') + # tms pid:tid cid:rid:'.....' + ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(\d+):(.*)$') if not os.path.isfile(file_name): return 0 range_stack = {} + range_map = {} record_id = 0 table_handle = db.add_table(table_name, ext_table_descr) @@ -257,7 +259,8 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): pid = m.group(2) tid = m.group(3) cid = int(m.group(4)) - msg = m.group(5) + rid = int(m.group(5)) + msg = m.group(6) rec_vals = [] @@ -285,6 +288,21 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): rec_vals = rec_stack.pop() rec_vals[1] = tms + # range start + if cid == 3: + range_map[rid] = (tms, msg) + continue + + # range stop + if cid == 4: + if rid in range_map: + (tms, msg) = range_map[rid] # querying start timestamp if rid exists + del range_map[rid] + else: fatal("range id(" + str(rid) + ") is not found") + rec_vals[0] = tms # begin timestamp + rec_vals[3] = 0 # 0 lane for ranges + rec_vals[4] = msg # range message + db.insert_entry(table_handle, rec_vals) record_id += 1 @@ -305,6 +323,8 @@ def get_field(args, field): def set_field(args, field, val): return re.subn(field + '\(\w+\)([ \)])', field + '(' + str(val) + ')\\1', args, count=1) +ops_patch_data = {} + # Fill API DB api_table_descr = [ ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index', 'Data'], @@ -329,6 +349,7 @@ def 
fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep file_name = indir + '/' + api_name + '_api_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') + hip_mcopy_ptrn = re.compile(r'hipMemcpy') ptrn_ac = re.compile(r'hsa_amd_memory_async_copy') ptrn1_kernel = re.compile(r'^.*kernel\(') ptrn2_kernel = re.compile(r'\)\) .*$') @@ -346,7 +367,16 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep record_id_dict = {} table_handle = db.add_table(table_name, api_table_descr) with open(file_name, mode='r') as fd: - for line in fd.readlines(): + file_lines = fd.readlines() + total_lines = len(file_lines) + line_index = 0 + for line in file_lines: + if (line_index == total_lines - 1) or (line_index % 100 == 0): + sys.stdout.write( \ + "\rscan " + api_name + " API data " + str(line_index) + ":" + str(total_lines) + " "*100 \ + ) + line_index += 1 + record = line[:-1] kernel_arg = '' @@ -361,7 +391,8 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep record = mfixformat.group(1) + '( ' + reformated_args + ')' m = ptrn_val.match(record) - if m: + if not m: fatal(api_name + " bad record: '" + record + "'") + else: rec_vals = [] rec_len = len(api_table_descr[0]) - 1 for ind in range(1, rec_len): @@ -385,11 +416,17 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep if found == 0: fatal('set_field() failed for "stream", args: "' + record_args + '"') else: stream_id = 0 + # extract kernel name string + (kernel_str, kernel_found) = get_field(record_args, 'kernel') + + if stream_found != 0 or kernel_found != 0: + ops_patch_data[(corr_id, proc_id)] = (stream_id if stream_found else 0, kernel_str if kernel_found else '') + # dependencies filling - if ptrn_ac.search(record_name) or (corr_id, proc_id) in dep_filtr: + if ptrn_ac.match(record_name) or hip_mcopy_ptrn.match(record_name): beg_ns = int(rec_vals[0]) end_ns = int(rec_vals[1]) - from_us = 
(beg_ns / 1000) + ((end_ns - beg_ns) / 1000) + from_us = end_ns / 1000 if not proc_id in dep_dict: dep_dict[proc_id] = {} dep_proc = dep_dict[proc_id] @@ -412,39 +449,31 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep copy_csv += str(copy_index) + ', ' + copy_line + '\n' copy_index += 1 - # patching activity properties: kernel name, stream-id - if (corr_id, proc_id) in dep_filtr: - ops_table_name = dep_filtr[(corr_id, proc_id)] - - select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + str(proc_id) - record_args = rec_vals[rec_len - 2] - - # extract kernel name string - (kernel_str, kernel_found) = get_field(record_args, 'kernel') - is_kernel_list = 1 if kernel_found != 0 and kernel_str[-1] == ';' else 0 - - if is_kernel_list != 0: - for kernel_item in kernel_str[:-1].split(';'): - m = ptrn_multi_kernel.match(kernel_item) - if m: - kernel_name = m.group(1) - dev_id = m.group(2) - select_expr += ' AND "dev-id" = ' + dev_id - activity_record_patching(db, ops_table_name, 1, kernel_name, stream_found, stream_id, select_expr) - else: - fatal('Bad multi-kernel format: "' + kernel_item + '" in "' + kernel_str + '"') - else: - activity_record_patching(db, ops_table_name, kernel_found, kernel_str, stream_found, stream_id, select_expr) - - api_data = '' - if mcopy_data_enabled: - api_data = memory_manager.register_api(rec_vals) if len(dep_filtr) else '' + if False: + # patching activity properties: kernel name, stream-id + if (corr_id, proc_id) in dep_filtr: + ops_table_name = dep_filtr[(corr_id, proc_id)] + select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + str(proc_id) + is_kernel_list = 1 if kernel_found != 0 and kernel_str[-1] == ';' else 0 + if is_kernel_list != 0: + for kernel_item in kernel_str[:-1].split(';'): + m = ptrn_multi_kernel.match(kernel_item) + if m: + kernel_name = m.group(1) + dev_id = m.group(2) + select_expr += ' AND "dev-id" = ' + dev_id + activity_record_patching(db, ops_table_name, 1, 
kernel_name, stream_found, stream_id, select_expr) + else: + fatal('Bad multi-kernel format: "' + kernel_item + '" in "' + kernel_str + '"') + else: + activity_record_patching(db, ops_table_name, kernel_found, kernel_str, stream_found, stream_id, select_expr) + + api_data = memory_manager.register_api(rec_vals) if mcopy_data_enabled and api_name == 'hip' else '' rec_vals.append(api_data) rec_vals[2] = api_pid db.insert_entry(table_handle, rec_vals) - else: fatal(api_name + " bad record: '" + record + "'") # inserting of dispatch events correlated to the dependent dispatches for (from_ns, proc_id, thrd_id) in dep_list: @@ -528,7 +557,16 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr) mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr) with open(file_name, mode='r') as fd: - for line in fd.readlines(): + file_lines = fd.readlines() + total_lines = len(file_lines) + line_index = 0 + for line in file_lines: + if (line_index == total_lines - 1) or (line_index % 100 == 0): + sys.stdout.write( \ + "\rscan ops data " + str(line_index) + ":" + str(total_lines) + " "*100 \ + ) + line_index += 1 + record = line[:-1] m = ptrn_val.match(record) if m: @@ -558,10 +596,17 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): if ptrn_barrier.search(name): name = '""' + tid = 0 + if (corr_id, proc_id) in ops_patch_data: + vals = ops_patch_data[(corr_id, proc_id)] + tid = vals[0] + name_patch = vals[1] + if name_patch != '': name = name_patch + # insert DB record rec_vals[4] = name # Name rec_vals.append(pid) # pid - rec_vals.append(0) # tid + rec_vals.append(tid) # tid rec_vals.append(corr_id) # Index rec_vals.append(proc_id) # proc-id rec_vals.append('') # Data @@ -641,8 +686,8 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): hsa_activity_found = fill_copy_db('COPY', db, indir) hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, 
COPY_PID, kern_dep_list, {}, 0) + hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], {}, 1) ops_filtr = fill_ops_db('OPS', 'COPY', db, indir) - hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) fill_kernel_db('A', db) diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 3f295a15..b176cadf 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -92,6 +92,8 @@ hsa_status_t rocprofiler_error_string( typedef enum { ROCPROFILER_FEATURE_KIND_METRIC = 0, ROCPROFILER_FEATURE_KIND_TRACE = 1, + ROCPROFILER_FEATURE_KIND_SPM_MOD = 2, + ROCPROFILER_FEATURE_KIND_PCSMP_MOD = 4 } rocprofiler_feature_kind_t; // Profiling feture parameter diff --git a/src/core/context.h b/src/core/context.h index d23c93ba..c368d42c 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -83,10 +83,16 @@ class Group { Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) : pmc_profile_(agent_info), + trace_profile_(agent_info), n_profiles_(0), refs_(1), context_(context), - index_(index) {} + index_(index), + barrier_signal_{}, + dispatch_signal_{}, + orig_signal_{}, + record_{} + {} void Insert(const profile_info_t& info) { const rocprofiler_feature_kind_t kind = info.rinfo->kind; @@ -95,6 +101,9 @@ class Group { case ROCPROFILER_FEATURE_KIND_METRIC: pmc_profile_.Insert(info); break; + case ROCPROFILER_FEATURE_KIND_TRACE: + trace_profile_.Insert(info); + break; default: EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -103,16 +112,24 @@ class Group { hsa_status_t Finalize(const bool is_concurrent = false) { hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, read_vector_, is_concurrent); + if (status == HSA_STATUS_SUCCESS) { + status = trace_profile_.Finalize(start_vector_, stop_vector_, + read_vector_, is_concurrent); + } if (status == HSA_STATUS_SUCCESS) { if (!pmc_profile_.Empty()) ++n_profiles_; + if (!trace_profile_.Empty()) 
++n_profiles_; } return status; } void GetProfiles(profile_vector_t& vec) { pmc_profile_.GetProfiles(vec); + trace_profile_.GetProfiles(vec); } + void GetTraceProfiles(profile_vector_t& vec) { trace_profile_.GetProfiles(vec); } + info_vector_t& GetInfoVector() { return info_vector_; } const pkt_vector_t& GetStartVector() const { return start_vector_; } const pkt_vector_t& GetStopVector() const { return stop_vector_; } @@ -120,6 +137,28 @@ class Group { Context* GetContext() { return context_; } uint32_t GetIndex() const { return index_; } + void SetBarrierSignal(const hsa_signal_t &signal) { + barrier_signal_ = signal; + } + hsa_signal_t& GetBarrierSignal() { + return barrier_signal_; + } + void SetDispatchSignal(const hsa_signal_t &signal) { + dispatch_signal_ = signal; + } + hsa_signal_t& GetDispatchSignal() { + return dispatch_signal_; + } + void SetOrigSignal(const hsa_signal_t &signal) { + orig_signal_ = signal; + } + const hsa_signal_t& GetOrigSignal() const { + return orig_signal_; + } + rocprofiler_dispatch_record_t* GetRecord() { + return &record_; + } + atomic_refs_t* AtomicRefsCount() { return reinterpret_cast(&refs_); } void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); } void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); } @@ -127,6 +166,7 @@ class Group { private: PmcProfile pmc_profile_; + TraceProfile trace_profile_; info_vector_t info_vector_; pkt_vector_t start_vector_; pkt_vector_t stop_vector_; @@ -135,6 +175,12 @@ class Group { refs_t refs_; Context* const context_; const uint32_t index_; + // completion signal of after-dispatch barrier + hsa_signal_t barrier_signal_; + // completion signal kernel packet dispatch + hsa_signal_t dispatch_signal_; + hsa_signal_t orig_signal_; + rocprofiler_dispatch_record_t record_; }; // Profiling context @@ -231,11 +277,21 @@ class Context { char* ptr; }; + void RestoreSignals(const profile_tuple_t& tuple) { + 
hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.dispatch_signal, 1); + if (k_concurrent_) { + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.read_signal, 1); + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.barrier_signal, 1); + } + } + void GetData(const uint32_t& group_index) { const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Wait for stop packet to complete hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1); + // Restore other signals + RestoreSignals(tuple); for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL}; const hsa_status_t status = @@ -261,28 +317,15 @@ class Context { } } - /* Handle the completion of kernel-begin 'read' packet */ - static bool HandlerRead(hsa_signal_value_t value, void* arg) { - Group* group = reinterpret_cast(arg); - Context* context = group->GetContext(); - - // Handle the completion signal of read packet at kernel begin - const profile_vector_t profile_vector = context->GetProfiles(group->GetIndex()); + void IterateTraceData(rocprofiler_trace_data_callback_t callback, void* data) { + profile_vector_t profile_vector; + set_[0].GetTraceProfiles(profile_vector); for (auto& tuple : profile_vector) { - // Wait for read packet to complete - util::HsaRsrcFactory::Instance().SignalWaitRestore(tuple.completion_signal, 1); - const profile_t* profile = tuple.profile; - // Copy the counter values, read at kernel begin, to the right half of - // the buffer, so that the next kernel-end read can reuse the left half - char* data = reinterpret_cast(profile->output_buffer.ptr); - const uint32_t num = profile->output_buffer.size / 2; - for(uint32_t i = 0; i < num; ++i) { - data[i+num] = data[i]; // left --> right - data[i] = 0; // reset left - } + if (pcsmp_mode_) const_cast(tuple.profile)->event_count = UINT32_MAX; + 
const hsa_status_t status = + api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, callback, data); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); } - - return false; } static bool Handler(hsa_signal_value_t value, void* arg) { @@ -300,24 +343,10 @@ class Context { Group* GetGroup(const uint32_t& index) { return &set_[index]; } rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } - void SetDispatchSignal(const hsa_signal_t &signal) { - dispatch_signal_ = signal; - } - hsa_signal_t& GetDispatchSignal() { - return dispatch_signal_; - } - void SetOrigSignal(const hsa_signal_t &signal) { - orig_signal_ = signal; - } - const hsa_signal_t& GetOrigSignal() const { - return orig_signal_; - } - rocprofiler_dispatch_record_t* GetRecord() { - return &record_; - } - // Concurrent profiling mode static bool k_concurrent_; + // Packets to stop the profiling + static pkt_vector_t stop_packets_; private: Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, @@ -330,15 +359,12 @@ class Context { metrics_(NULL), handler_(handler), handler_arg_(handler_arg), - dispatch_signal_{}, - orig_signal_{}, - record_{} + pcsmp_mode_(false) {} ~Context() { Destruct(); } void Destruct() { - hsa_signal_destroy(dispatch_signal_); for (const auto& v : info_map_) { const std::string& name = v.first; const rocprofiler_feature_t* info = v.second; @@ -373,20 +399,14 @@ class Context { set_[group_index].ResetRefsCount(); const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { - // Handler for read packet completion - if (k_concurrent_) { - hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, HandlerRead, - &set_[group_index]); - } + set_[group_index].SetDispatchSignal(tuple.dispatch_signal); + set_[group_index].SetBarrierSignal(tuple.barrier_signal); // Handler for stop packet completion 
hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, &set_[group_index]); } } } - - hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed"); } // Initialize rocprofiler context @@ -469,6 +489,23 @@ class Context { const uint32_t group_index = block_status.group_index; set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); } + } else if (kind & ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features + info->kind = ROCPROFILER_FEATURE_KIND_TRACE; + + const event_t* event = NULL; + if (kind & ROCPROFILER_FEATURE_KIND_PCSMP_MOD) { // PC sampling + pcsmp_mode_ = true; + } else if (kind & ROCPROFILER_FEATURE_KIND_SPM_MOD) { // SPM trace + const Metric* metric = metrics_->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); + counters_vec_t counters_vec = metric->GetCounters(); + if (counters_vec.size() != 1) + EXC_RAISING(HSA_STATUS_ERROR, "trace bad metric '" << name << "' is not base counter"); + const counter_t* counter = counters_vec[0]; + event = &(counter->event); + } + set_[0].Insert(profile_info_t{event, info->parameters, info->parameter_count, info}); } else { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -498,6 +535,7 @@ class Context { hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; callback_data_t* callback_data = reinterpret_cast(data); + const profile_t* profile = callback_data->profile; info_vector_t& info_vector = *(callback_data->info_vector); uint32_t index = callback_data->index; const uint32_t sample_id = ainfo_data->sample_id; @@ -516,6 +554,56 @@ class Context { if (ainfo_data->sample_id == 0) rinfo->data.result_int64 = 0; rinfo->data.result_int64 += ainfo_data->pmc_data.result; rinfo->data.kind = ROCPROFILER_DATA_KIND_INT64; + } else if 
(ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { + if (rinfo->data.result_bytes.copy) { + const bool trace_local = TraceProfile::IsLocal(); + util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); + if (sample_id == 0) { + const uint32_t output_buffer_size = profile->output_buffer.size; + const uint32_t output_buffer_size64 = profile->output_buffer.size / sizeof(uint64_t); + const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent); + void* ptr = (trace_local) ? hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : + calloc(output_buffer_size64, sizeof(uint64_t)); + rinfo->data.result_bytes.size = output_buffer_size; + rinfo->data.result_bytes.ptr = ptr; + callback_data->ptr = reinterpret_cast(ptr); + } + char* result_bytes_ptr = reinterpret_cast(rinfo->data.result_bytes.ptr); + const char* end = result_bytes_ptr + rinfo->data.result_bytes.size; + const char* src = reinterpret_cast(ainfo_data->trace_data.ptr); + uint32_t size = ainfo_data->trace_data.size; + char* ptr = callback_data->ptr; + uint32_t* header = reinterpret_cast(ptr); + char* dest = ptr + sizeof(*header); + + if ((dest + size) >= end) { + if (dest < end) size = end - dest; + else EXC_RAISING(HSA_STATUS_ERROR, "Trace data out of output buffer"); + } + + bool suc = true; + if (trace_local) { + suc = hsa_rsrc->Memcpy(profile->agent, dest, src, size); + } else { + memcpy(dest, src, size); + } + if (suc) { + *header = size; + callback_data->ptr = dest + align_size(size, sizeof(uint32_t)); + rinfo->data.result_bytes.instance_count = sample_id + 1; + rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; + } else + EXC_RAISING(HSA_STATUS_ERROR, "Agent Memcpy failed, dst(" << (void*)dest << ") src(" << (void*)src << ") size(" << size << ")"); + } else { + if (sample_id == 0) { + rinfo->data.result_bytes.ptr = profile->output_buffer.ptr; + rinfo->data.result_bytes.size = profile->output_buffer.size; + rinfo->data.result_bytes.instance_count = UINT32_MAX; + } + + 
rinfo->data.result_bytes.instance_count += 1; + rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; + } } else { EXC_RAISING(HSA_STATUS_ERROR, "unknown data type = " << ainfo_type); } @@ -555,15 +643,13 @@ class Context { rocprofiler_handler_t handler_; void* handler_arg_; - // kernel packet dispatch copmletion signal - hsa_signal_t dispatch_signal_; - hsa_signal_t orig_signal_; - rocprofiler_dispatch_record_t record_; - + // PC sampling mode + bool pcsmp_mode_; }; #define CONTEXT_INSTANTIATE() \ - bool rocprofiler::Context::k_concurrent_ = false; + bool rocprofiler::Context::k_concurrent_ = false; \ + std::vector rocprofiler::Context::stop_packets_{}; } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 5cd09b10..946ba424 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -49,8 +49,6 @@ enum { extern decltype(hsa_queue_create)* hsa_queue_create_fn; extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; -void PmcStarter(Context* context); - static std::mutex ctx_a_mutex; typedef std::map ctx_a_map_t; static ctx_a_map_t* ctx_a_map = NULL; @@ -186,8 +184,8 @@ class InterceptQueue { if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) { if (group.feature_count != 0) { if (tracker_ != NULL) { - const_cast(dispatch_packet)->completion_signal = context->GetDispatchSignal(); Group* context_group = context->GetGroup(group.index); + const_cast(dispatch_packet)->completion_signal = context_group->GetDispatchSignal(); Tracker::Enable_opt(context_group, completion_signal); context_group->IncrRefsCount(); } @@ -271,9 +269,11 @@ class InterceptQueue { // Adding kernel timing tracker Tracker::entry_t* tracker_entry = NULL; + + const bool is_serial = (k_concurrent_ == K_CONC_OFF); if (tracker_ != NULL) { - tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal); - const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; + tracker_entry = 
tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal, is_serial); + if (is_serial) const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; } // Prepareing dispatch callback data @@ -297,43 +297,55 @@ class InterceptQueue { // Calling dispatch callback rocprofiler_group_t group = {}; hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); - // Injecting profiling start/stop packets + // Injecting profiling start/stop/read packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { if (tracker_entry != NULL) { - const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; + if (is_serial) const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; tracker_->Delete(tracker_entry); } } else { Context* context = reinterpret_cast(group.context); if (group.feature_count != 0) { - if (tracker_entry != NULL) { - Group* context_group = context->GetGroup(group.index); - context_group->IncrRefsCount(); - tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); - } - const pkt_vector_t& start_vector = context->StartPackets(group.index); const pkt_vector_t& stop_vector = context->StopPackets(group.index); const pkt_vector_t& read_vector = context->ReadPackets(group.index); pkt_vector_t packets; - if (k_concurrent_ == K_CONC_OFF) { // serial + if (is_serial) { // serial packets = start_vector; packets.insert(packets.end(), *packet); packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); } else { // concurrent - // Atrt PMC once - std::call_once(once_flag_, PmcStarter, context); - // Reads at both kernel start and end - assert(read_vector.size() == 2 * start_vector.size()); + // Insert start packets once + auto inject_start = [&packets](const pkt_vector_t& starts) mutable { + packets = starts; + }; + std::call_once(once_flag_, inject_start, start_vector); + // Reads at both kernel start and end (also with barriers) + 
assert(read_vector.size() >= 2 * start_vector.size()); auto mid = read_vector.begin() + read_vector.size()/2; // Read at kernel start packets.insert(packets.end(), read_vector.begin(), mid); // Kernel dispatch packet + assert(tracker_entry != NULL); + // Bind dispatch and barrier signals with tracker entry + tracker_->SetHandler(tracker_entry, context->GetGroup(group.index)); + const_cast(dispatch_packet)->completion_signal = context->GetGroup(group.index)->GetDispatchSignal(); packets.insert(packets.end(), *packet); // Read at kernel end packets.insert(packets.end(), mid, read_vector.end()); + + // Save the stop packets for eventual PmcStopper + if (Context::stop_packets_.empty()) { + Context::stop_packets_.insert(Context::stop_packets_.end(), stop_vector.begin(), stop_vector.end()); + } + } + + if (tracker_entry != NULL) { + Group* context_group = context->GetGroup(group.index); + context_group->IncrRefsCount(); + tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); } if (writer != NULL) { diff --git a/src/core/profile.h b/src/core/profile.h index f6165d07..09ad2644 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -45,6 +45,9 @@ struct profile_tuple_t { const profile_t* profile; info_vector_t* info_vector; hsa_signal_t completion_signal; + hsa_signal_t dispatch_signal; + hsa_signal_t barrier_signal; + hsa_signal_t read_signal; }; typedef std::vector profile_vector_t; @@ -102,6 +105,9 @@ class Profile { profile_ = {}; profile_.agent = agent_info->dev_id; completion_signal_ = {}; + dispatch_signal_ = {}; + barrier_signal_ = {}; + read_signal_ = {}; is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0); } @@ -115,6 +121,18 @@ class Profile { hsa_status_t status = hsa_signal_destroy(completion_signal_); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); } + if (dispatch_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(dispatch_signal_); + if (status != 
HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } + if (barrier_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(barrier_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } + if (read_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(read_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } } virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); } @@ -143,6 +161,14 @@ class Profile { profile->parameter_count += 1; } + void BarrierPacket(packet_t* packet, const hsa_signal_t& prior_signal) { + hsa_barrier_and_packet_t* barrier = + reinterpret_cast(packet); + barrier->header = HSA_PACKET_TYPE_BARRIER_AND; + if (prior_signal.handle) barrier->dep_signal[0] = prior_signal; // set packet dependency + else barrier->header |= 1 << HSA_PACKET_HEADER_BARRIER; // set barrier bit + } + hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, pkt_vector_t& read_vector, bool is_concurrent = false) { if (is_concurrent) SetConcurrent(&profile_); @@ -190,13 +216,32 @@ class Profile { start.completion_signal = dummy_signal; // Set completion signal of read/stop - hsa_signal_t post_signal; - status = hsa_signal_create(1, 0, NULL, &post_signal); + status = hsa_signal_create(1, 0, NULL, &completion_signal_); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); - stop.completion_signal = post_signal; - read.completion_signal = post_signal; - read2.completion_signal = post_signal; - completion_signal_ = post_signal; + if (is_concurrent) { + status = hsa_signal_create(1, 0, NULL, &read_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + read.completion_signal = read_signal_; + read2.completion_signal = completion_signal_; + } else { + read.completion_signal = 
completion_signal_; + } + stop.completion_signal = completion_signal_; + + status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + + // Create barrier packets: enforce start to be done first, and further make + // read and read2 finish before and after kernel dispatch, respectively + packet_t barrier_st, barrier_rd{}, barrier_rd2{}; + if (is_concurrent) { + BarrierPacket(&barrier_st, start.completion_signal); + BarrierPacket(&barrier_rd, read.completion_signal); + BarrierPacket(&barrier_rd2, dispatch_signal_); + status = hsa_signal_create(1, 0, NULL, &(barrier_signal_)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + barrier_rd2.completion_signal = barrier_signal_; + } // Fill packet vectors if (is_legacy_) { @@ -218,7 +263,11 @@ class Profile { if (rd_status == HSA_STATUS_SUCCESS) { pkt_vector_t reads = {read}; - if (is_concurrent) reads.push_back(read2); + if (is_concurrent) { + reads.push_back(barrier_rd); + reads.push_back(barrier_rd2); + reads.push_back(read2); + } for (auto rd : reads) { const uint32_t read_index = read_vector.size(); read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); @@ -230,11 +279,15 @@ class Profile { } } else { start_vector.push_back(start); + if (is_concurrent) start_vector.push_back(barrier_st); stop_vector.push_back(stop); if (rd_status == HSA_STATUS_SUCCESS) { read_vector.push_back(read); - if (is_concurrent) + if (is_concurrent) { + read_vector.push_back(barrier_rd); + read_vector.push_back(barrier_rd2); read_vector.push_back(read2); + } } } } @@ -244,7 +297,8 @@ class Profile { void GetProfiles(profile_vector_t& vec) { if (!info_vector_.empty()) { - vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_}); + vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_, + dispatch_signal_, barrier_signal_, read_signal_}); } } 
@@ -258,6 +312,9 @@ class Profile { profile_t profile_; info_vector_t info_vector_; hsa_signal_t completion_signal_; + hsa_signal_t dispatch_signal_; + hsa_signal_t barrier_signal_; + hsa_signal_t read_signal_; }; class PmcProfile : public Profile { @@ -280,6 +337,46 @@ class PmcProfile : public Profile { } }; +class TraceProfile : public Profile { + public: + static inline void SetSize(const uint32_t& size) { output_buffer_size_ = size; } + static inline uint32_t GetSize() { return output_buffer_size_; } + static inline void SetLocal(const bool& b) { output_buffer_local_ = b; } + static inline bool IsLocal() { return output_buffer_local_; } + + TraceProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE; + } + + void Insert(const profile_info_t& info) { + if (info.parameters != NULL) { + Profile::Insert(info); + for (unsigned j = 0; j < info.parameter_count; ++j) { + Config(&profile_).Insert(info.parameters[j]); + } + } else if (info.event != NULL) { + Config(&profile_).Insert(*(info.event)); + } else { + EXC_ABORT(HSA_STATUS_ERROR, "invalid trace info inserted"); + } + } + + hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { + profile_.command_buffer.ptr = + rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + profile_.output_buffer.size = output_buffer_size_; + profile_.output_buffer.ptr = (output_buffer_local_) ? + rsrc->AllocateLocalMemory(agent_info_, profile_.output_buffer.size) : + rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); + return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? 
HSA_STATUS_SUCCESS + : HSA_STATUS_ERROR; + } + + private: + static uint32_t output_buffer_size_; + static bool output_buffer_local_; +}; + } // namespace rocprofiler #endif // SRC_CORE_PROFILE_H_ diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index d5af91c2..5d1cd9c7 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -150,20 +150,6 @@ void RestoreHsaApi() { table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; } -void PmcStarter(Context* context) { - hsa_agent_t agent = context->GetAgent(); - // Create queue - hsa_queue_t* queue; - hsa_status_t status = rocprofiler::CreateQueuePro(agent, 1, - HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro"); - HsaQueue hsa_queue(NULL, queue); - context->Start(0, &hsa_queue); - context->Read(0, &hsa_queue); - context->GetData(0); - hsa_queue_destroy(queue); -} - void StandaloneIntercept() { ::HsaApiTable* table = kHsaApiTable; table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro; @@ -213,6 +199,8 @@ uint32_t LoadTool() { rocprofiler_settings_t settings{}; settings.intercept_mode = (intercept_mode != 0) ? 1 : 0; + settings.trace_size = TraceProfile::GetSize(); + settings.trace_local = TraceProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; settings.code_obj_tracking = 1; @@ -220,6 +208,8 @@ uint32_t LoadTool() { if (handler) handler(); else if (handler_prop) handler_prop(&settings); + TraceProfile::SetSize(settings.trace_size); + TraceProfile::SetLocal(settings.trace_local != 0); util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; @@ -237,9 +227,40 @@ uint32_t LoadTool() { return intercept_mode; } +void PmcStopper() { + rocprofiler::util::HsaRsrcFactory* rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + + const uint32_t gpu_count = rsrc->GetCountOfGpuAgents(); + for (uint32_t gpu_id = 0; gpu_id < gpu_count; gpu_id++) { + // Get agent info + const rocprofiler::util::AgentInfo* agent_info; + if (rsrc->GetGpuAgentInfo(gpu_id, &agent_info) == false) { + fprintf(stderr, "Error: GetGpuAgentInfo(%u) \n", gpu_id); + abort(); + } + + // Create queue + hsa_queue_t* queue; + hsa_status_t status = rocprofiler::CreateQueuePro(agent_info->dev_id, 1, + HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro (" + << gpu_id << ") " << std::hex << status); + + // Submit packets + for (auto& pkt: Context::stop_packets_) { + rsrc->Submit(queue, &pkt); + // Wait for stop packet to complete + rsrc->SignalWaitRestore(pkt.completion_signal, 1); + } + + hsa_queue_destroy(queue); + } +} + // Unload profiling tool librray void UnloadTool() { ONLOAD_TRACE("tool handle(" << tool_handle << ")"); + //if (Context::k_concurrent_) PmcStopper(); if (tool_handle) { tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); if (handler == NULL) { @@ -433,6 +454,8 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( } rocprofiler_properties_t rocprofiler_properties; +uint32_t TraceProfile::output_buffer_size_ = 0x2000000; // 32M +bool TraceProfile::output_buffer_local_ = true; 
std::atomic Tracker::instance_{}; Tracker::mutex_t Tracker::glob_mutex_; Tracker::counter_t Tracker::counter_ = 0; @@ -707,7 +730,10 @@ PUBLIC_API hsa_status_t rocprofiler_stop_queue_callbacks() { // Method for iterating the events output data PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data( rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) { - return HSA_STATUS_ERROR; + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->IterateTraceData(callback, data); + API_METHOD_SUFFIX } //////////////////////////////////////////////////////////////////////////////// diff --git a/src/core/tracker.h b/src/core/tracker.h index d538aff7..f98c355e 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -62,6 +62,7 @@ class Tracker { void* arg; bool is_context; bool is_memcopy; + bool is_proxy; }; static Tracker* Create() { @@ -88,7 +89,7 @@ class Tracker { } // Add tracker entry - entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) { + entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig, bool proxy=true) { hsa_status_t status = HSA_STATUS_ERROR; // Creating a new tracker entry @@ -105,11 +106,14 @@ class Tracker { entry->record = record; // Creating a proxy signal - const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1; - status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); - status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + if (proxy) { + entry->is_proxy = true; + const hsa_signal_value_t signal_value = (orig.handle) ? 
hsa_api_.hsa_signal_load_relaxed(orig) : 1; + status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } // Adding antry to the list mutex_.lock(); @@ -120,9 +124,17 @@ class Tracker { return entry; } + void SetHandler(entry_t* entry, Group* group) { + hsa_signal_t& dispatch_signal = group->GetDispatchSignal(); + hsa_signal_t& handler_signal = group->GetBarrierSignal(); + entry->signal = dispatch_signal; + hsa_status_t status = hsa_api_.hsa_amd_signal_async_handler(handler_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } + // Delete tracker entry void Delete(entry_t* entry) { - hsa_api_.hsa_signal_destroy(entry->signal); + if (entry->is_proxy && entry->signal.handle) hsa_api_.hsa_signal_destroy(entry->signal); mutex_.lock(); sig_list_.erase(entry->it); mutex_.unlock(); @@ -157,14 +169,13 @@ class Tracker { // Enable tracking static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) { - Context* context = group->GetContext(); - context->SetOrigSignal(orig_signal); - context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); + group->SetOrigSignal(orig_signal); + group->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); // Creating a proxy signal const hsa_signal_value_t signal_value = (orig_signal.handle) ? 
util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1; - hsa_signal_t& dispatch_signal = context->GetDispatchSignal(); + hsa_signal_t& dispatch_signal = group->GetDispatchSignal(); util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value); hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler_opt, group); @@ -175,8 +186,8 @@ class Tracker { static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) { Group* group = reinterpret_cast(arg); Context* context = group->GetContext(); - hsa_signal_t dispatch_signal = context->GetDispatchSignal(); - record_t* record = context->GetRecord(); + hsa_signal_t dispatch_signal = group->GetDispatchSignal(); + record_t* record = group->GetRecord(); hsa_amd_profiling_dispatch_time_t dispatch_time{}; hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time); @@ -186,7 +197,7 @@ class Tracker { record->complete = util::HsaRsrcFactory::Instance().TimestampNs(); // Original intercepted signal completion - const hsa_signal_t& orig_signal = context->GetOrigSignal(); + const hsa_signal_t& orig_signal = group->GetOrigSignal(); if (orig_signal.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig_signal.handle); amd_signal_t* prof_signal_ptr = reinterpret_cast(dispatch_signal.handle); diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index e2f97ce9..7cbaecc5 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -24,6 +24,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "util/hsa_rsrc_factory.h" +#include #include #include #include @@ -47,6 +48,13 @@ POSSIBILITY OF SUCH DAMAGE. 
namespace rocprofiler { namespace util { +// Demangle C++ symbol name +static const char* cpp_demangle(const char* symname) { + size_t size = 0; + int status; + const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); + return (ret != 0) ? ret : strdup(symname); +} // Callback function to get available in the system agents hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { @@ -732,11 +740,12 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_ex CHECK_STATUS("Error in getting kernel object", status); status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); CHECK_STATUS("Error in getting name len", status); - char *name = new char[len + 1]; - status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + char symname[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symname); CHECK_STATUS("Error in getting kernel name", status); - name[len] = 0; + symname[len] = 0; if (data == NULL) { + const char* name = cpp_demangle(symname); auto ret = symbols_map_->insert({addr, name}); if (ret.second == false) { delete[] ret.first->second; diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 30e35504..4724b87b 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -210,6 +210,14 @@ std::string filtr_kernel_name(const std::string name) { open_token = '>'; close_token = '<'; break; + case ']': + counter = 1; + open_token = ']'; + close_token = '['; + break; + case ' ': + ++rit; + continue; } if (counter == 0) break; } else { @@ -218,9 +226,8 @@ std::string filtr_kernel_name(const std::string name) { } ++rit; } - while (rit != rend) if ((*rit == ' ') || (*rit == ' ')) rit++; else break; auto rbeg = rit; - while (rit != rend) if ((*rit != ' ') && (*rit != ':')) rit++; else break; + while ((rit != rend) && (*rit != ' ') && (*rit != ':')) rit++; const uint32_t pos = 
rend - rit; const uint32_t length = rit - rbeg; return name.substr(pos, length); @@ -973,6 +980,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Set HSA intercepting check_env_var("ROCP_HSA_INTERC", settings->hsa_intercepting); if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14); + // Enable concurrent SQTT + check_env_var("ROCP_K_CONCURRENT", settings->k_concurrent); // Enable optmized mode check_env_var("ROCP_OPT_MODE", settings->opt_mode); From 2ae6abd151ebf0a74596715dfc852856f4c1aeb7 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 15 Oct 2020 12:46:08 -0500 Subject: [PATCH 128/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9108409f..4b093e4b 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] - --obj-tracking - to turn on/off kernels code objects tracking [off] + --obj-tracking - to turn on/off kernels code objects tracking [on] To support V3 code-object. --stats - generating kernel execution stats, file .stats.csv From 0fb8713913b95bd5c51879b6239709748eb583fc Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 15 Oct 2020 12:48:35 -0500 Subject: [PATCH 129/153] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4b093e4b..91361b56 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,6 @@ Configuration file: timestamp=off ctx-limit=0 heartbeat=0 - obj-tracking=off + obj-tracking=on > ``` From 73a8c80c3196e01f4471c403c515973793e1e922 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Fri, 23 Oct 2020 12:16:52 -0500 Subject: [PATCH 130/153] Add rocminfo in the binary package dependency. 
--- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e6765e47..4584e914 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,11 +176,11 @@ set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "ROCPROFILER library for AMD HSA runtime set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) ## Debian package specific variables -set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo" ) set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) ## RPM package specific variables -set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo" ) set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) From 7df95623af3b732b27eddf11696b235b72ce5fee Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 28 Oct 2020 11:22:09 -0500 Subject: [PATCH 131/153] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 25e61df7..75ab58b7 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -635,7 +635,7 @@ example but in SINGLEGROUP mode when only one group is allowed the context handl saved and then direct context method rocprofiler_get_data with default group index equal to 0 can be used. 
-hsa_status_t_dispatch_callback( +hsa_status_t dispatch_callback( const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) From 759f081cf325bc3e1597b97eacb7ed6d7303fe60 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 28 Oct 2020 17:59:40 -0500 Subject: [PATCH 132/153] 3.9 update --- bin/mem_manager.py | 11 +++++------ bin/sqlitedb.py | 1 + bin/tblextr.py | 3 ++- test/tool/tool.cpp | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bin/mem_manager.py b/bin/mem_manager.py index 8b616cc6..e87c4bca 100755 --- a/bin/mem_manager.py +++ b/bin/mem_manager.py @@ -188,17 +188,15 @@ def add_memcpy(self, recvals): m = hipMemcpy_ptrn_kind.match(args) if m: - direction = switcher.get(m.group(1), "unknown") + direction = switcher.get(m.group(1), "unknown") copy_line = str(start_time) + DELIM + str(end_time) + DELIM + pid + DELIM + tid + DELIM + event + DELIM + 'Direction=' + direction + DELIM + 'SrcType=' + srcptr_type + DELIM + 'DstType=' + dstptr_type + DELIM + "Size=" + str(size) + DELIM + "BW=" + str(round(bandwidth, 2)) + DELIM + 'Async=' + str(is_async) self.memcopies[recordid] = copy_line return copy_line; - def dump_data(self): - # To create “MM” table in DB on the finish - table_name = "MM" - file_name = os.environ['PWD'] + '/results.memcopy_info.csv' + def dump_data(self, table_name, file_name): + # To create memcopy info table in DB print("File '" + file_name + "' is generating") table_handle = self.db.add_table(table_name, mm_table_descr) @@ -212,5 +210,6 @@ def dump_data(self): else: rec_vals_array.append(rec) self.db.insert_entry(table_handle, rec_vals_array) - # To dump the MM table as CSV + + # To dump the memcopy info table as CSV self.db.dump_csv(table_name, file_name) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 62553a81..7aae7c17 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -233,6 +233,7 @@ def commit(self): # close DB def close(self): + self.connection.commit() self.connection.close() # 
access DB diff --git a/bin/tblextr.py b/bin/tblextr.py index 1b39a415..4ccc92e9 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -672,6 +672,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) ops_statfile = statfile copy_statfile = re.sub(r'\.stats\.csv$', r'.copy_stats.csv', statfile) + memcopy_info_file = re.sub(r'\.stats\.csv$', r'.memcopy_info.csv', statfile) sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo.txt', statfile) metadata_gen(sysinfo_file, 'rocminfo') @@ -778,7 +779,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): db.close_json(jsonfile); if mcopy_data_enabled: - memory_manager.dump_data() + memory_manager.dump_data('MM', memcopy_info_file) db.close() diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 4724b87b..34650a34 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -980,7 +980,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Set HSA intercepting check_env_var("ROCP_HSA_INTERC", settings->hsa_intercepting); if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14); - // Enable concurrent SQTT + // Enable concurrent mode check_env_var("ROCP_K_CONCURRENT", settings->k_concurrent); // Enable optmized mode check_env_var("ROCP_OPT_MODE", settings->opt_mode); From 777925295d996313cdeb704551dbd5cad31f5a61 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 30 Nov 2020 00:48:16 -0600 Subject: [PATCH 133/153] 3.10 update --- CMakeLists.txt | 10 ++- bin/dform.py | 16 ++-- bin/mem_manager.py | 67 ++++++++------- bin/rpl_run.sh | 1 + bin/sqlitedb.py | 5 +- bin/tblextr.py | 100 ++++++++++++--------- inc/rocprofiler.h | 19 +++- src/CMakeLists.txt | 2 +- src/core/activity.cpp | 172 +++++++++++++++++++++++++++++++++++++ src/core/activity.h | 26 ++++++ src/core/hsa_interceptor.h | 140 +++++++++++++++++++++++------- src/core/rocprofiler.cpp | 4 +- test/tool/tool.cpp | 43 
++++++++-- 13 files changed, 478 insertions(+), 127 deletions(-) create mode 100644 src/core/activity.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e6765e47..c5de434d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,8 +135,14 @@ add_custom_target ( so-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} # Install header and library install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${DEST_NAME}/lib ) -install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${DEST_NAME}/include ) -install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION include/${DEST_NAME} ) +install ( FILES + ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/core/activity.h + DESTINATION ${DEST_NAME}/include ) +install ( FILES + ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/core/activity.h + DESTINATION include/${DEST_NAME} ) # rpl_run.sh tblextr.py txt2xml.sh install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh diff --git a/bin/dform.py b/bin/dform.py index 82a81d08..1ed78d8f 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -46,26 +46,26 @@ def gen_table_bins(db, table, outfile, name_var, dur_ns_var): db.execute('DROP VIEW B') gen_message(outfile) -def gen_api_json_trace(db, table, start_us, outfile): - db.execute('create view B as select "Index", Name as name, pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (start_us, table)); +def gen_api_json_trace(db, table, start_ns, outfile): + db.execute('create view B as select "Index", Name as name, pid, tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (start_ns, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') gen_message(outfile) -def gen_ext_json_trace(db, table, start_us, outfile): - db.execute('create view B as select Name as name, pid, tid, (BeginNs/1000 - %d) as ts, ((EndNs - BeginNs)/1000) as dur from %s;' % (start_us, table)); +def gen_ext_json_trace(db, 
table, start_ns, outfile): + db.execute('create view B as select Name as name, pid, tid, ((BeginNs - %d)/1000) as ts, ((EndNs - BeginNs)/1000) as dur from %s;' % (start_ns, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') gen_message(outfile) -def gen_ops_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_us, table)); +def gen_ops_json_trace(db, table, base_pid, start_ns, outfile): + db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_ns, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') gen_message(outfile) -def gen_kernel_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_us, table)); +def gen_kernel_json_trace(db, table, base_pid, start_ns, outfile): + db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_ns, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') gen_message(outfile) diff --git a/bin/mem_manager.py b/bin/mem_manager.py index e87c4bca..8480063c 100755 --- a/bin/mem_manager.py +++ b/bin/mem_manager.py @@ -124,13 +124,13 @@ def add_memcpy(self, recvals): select_expr = '"Index" = ' + str(recordid) + ' AND "proc-id" = ' + str(procid) # hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) - hipMemcpy_ptrn = re.compile(r'\(dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)') + hipMemcpy_ptrn = re.compile(r'\(\s*dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)') # hipMemcpy2D(void* dst, 
size_t dpitch, const void* src, size_t spitch, size_t width, # size_t height, hipMemcpyKind kind); - hipMemcpy_ptrn2 = re.compile(r'\(dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)') + hipMemcpy_ptrn2 = re.compile(r'\(\s*dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)') # hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, # size_t count, hipMemcpyKind kind); - hipMemcpy_ptrn3 = re.compile(r'\(dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)') + hipMemcpy_ptrn3 = re.compile(r'\(\s*dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)') # memcopy with kind argument hipMemcpy_ptrn_kind = re.compile(r'.* kind\((\d+)\)\s*.*') # aysnc memcopy @@ -163,34 +163,39 @@ def add_memcpy(self, recvals): '4': "auto", } - if m_basic or m_2d or m_array: - if m_basic: - dstptr = m_basic.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_basic.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = int(m_basic.group(3)) - if m_array: - dstptr = m_array.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_array.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = m_array.group(3) - if m_2d: - dstptr = m_2d.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_2d.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = m_2d.group(3)*m_2d.group(4) - - duration = (int(end_time) - int(start_time)) if not is_async else (int(async_copy_end_time) - int(async_copy_start_time)) - bandwidth = float(size) * 1000 / duration - - m = hipMemcpy_ptrn_kind.match(args) - if m: - direction = switcher.get(m.group(1), "unknown") - - copy_line = str(start_time) + DELIM + str(end_time) + DELIM + pid + DELIM + tid + DELIM + event + DELIM + 'Direction=' + direction + DELIM + 'SrcType=' + srcptr_type + DELIM + 'DstType=' + dstptr_type + DELIM + "Size=" + str(size) + DELIM + "BW=" + str(round(bandwidth, 2)) + DELIM + 'Async=' + str(is_async) + condition_matched = False + if m_basic: + dstptr = m_basic.group(1) + 
dstptr_type = self.get_ptr_type(dstptr) + srcptr = m_basic.group(2) + srcptr_type = self.get_ptr_type(srcptr) + size = int(m_basic.group(3)) + condition_matched = True + if m_array: + dstptr = m_array.group(1) + dstptr_type = self.get_ptr_type(dstptr) + srcptr = m_array.group(2) + srcptr_type = self.get_ptr_type(srcptr) + size = m_array.group(3) + condition_matched = True + if m_2d: + dstptr = m_2d.group(1) + dstptr_type = self.get_ptr_type(dstptr) + srcptr = m_2d.group(2) + srcptr_type = self.get_ptr_type(srcptr) + size = m_2d.group(3)*m_2d.group(4) + condition_matched = True + + if not condition_matched: fatal('Memcpy args \"' + args + '\" cannot be identified') + + duration = (int(end_time) - int(start_time)) if not is_async else (int(async_copy_end_time) - int(async_copy_start_time)) + bandwidth = float(size) * 1000 / duration + + m = hipMemcpy_ptrn_kind.match(args) + if m: + direction = switcher.get(m.group(1), "unknown") + + copy_line = str(start_time) + DELIM + str(end_time) + DELIM + pid + DELIM + tid + DELIM + event + DELIM + 'Direction=' + direction + DELIM + 'SrcType=' + srcptr_type + DELIM + 'DstType=' + dstptr_type + DELIM + "Size=" + str(size) + DELIM + "BW=" + str(round(bandwidth, 2)) + DELIM + 'Async=' + str(is_async) self.memcopies[recordid] = copy_line return copy_line; diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index f45b8312..6d66405d 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -443,6 +443,7 @@ while [ 1 ] ; do elif [ "$1" = "--parallel-kernels" ] ; then ARG_VAL=0 export ROCP_K_CONCURRENT=1 + export AQLPROFILE_READ_API=1 elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 7aae7c17..50adb698 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -151,7 +151,7 @@ def label_json(self, pid, label, file_name): fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)) self.section_index += 1 - def 
flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): + def flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: @@ -160,8 +160,7 @@ def flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id corr_id = corr_id_list[ind] if (len(corr_id_list) != 0) else ind if corr_id in to_us_dict: (from_ts, from_tid, to_tid) = from_us_list[ind] - from_ts -= start_us - to_ts = to_us_dict[corr_id] - start_us + to_ts = to_us_dict[corr_id] if from_ts > to_ts: from_ts = to_ts fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (from_ts, dep_id, from_pid, from_tid)) fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (to_ts, dep_id, to_pid, to_tid)) diff --git a/bin/tblextr.py b/bin/tblextr.py index 4ccc92e9..ce1ff348 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -37,7 +37,7 @@ NONE_PID = -1 max_gpu_id = -1 -START_US = 0 +START_NS = 0 hsa_activity_found = 0 @@ -164,9 +164,10 @@ def parse_res(infile): var_table[dispatch_number]['CompleteNs'] = m.group(4) ## filling dependenciws - from_ns = m.group(1) - from_us = int(from_ns) / 1000 - to_us = int(m.group(2)) / 1000 + from_ns = int(m.group(1)) + to_ns = int(m.group(2)) + from_us = int((from_ns - START_NS) / 1000) + to_us = int((to_ns - START_NS) / 1000) kern_dep_list.append((from_ns, disp_pid, disp_tid)) @@ -346,23 +347,19 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep if (hsa_activity_found): copy_raws = db.table_get_raws('COPY') copy_csv = '' copy_index = 0 + op_found = 0 file_name = indir + '/' + api_name + '_api_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') hip_mcopy_ptrn = re.compile(r'hipMemcpy') ptrn_ac = 
re.compile(r'hsa_amd_memory_async_copy') - ptrn1_kernel = re.compile(r'^.*kernel\(') - ptrn2_kernel = re.compile(r'\)\) .*$') ptrn_fixformat = re.compile(r'(\d+:\d+ \d+:\d+ \w+)\(\s*(.*)\)$') ptrn_fixkernel = re.compile(r'\s+kernel=(.*)$') ptrn_multi_kernel = re.compile(r'(.*):(\d+)$') + ptrn_corr_id = re.compile(r'\ :(\d*)$') if not os.path.isfile(file_name): return 0 - dep_tid_list = [] - dep_from_us_list = [] - dep_id_list = [] - # parsing an input trace file and creating a DB table record_id_dict = {} table_handle = db.add_table(table_name, api_table_descr) @@ -379,6 +376,12 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep record = line[:-1] + corr_id = 0 + m = ptrn_corr_id.search(record) + if m: + corr_id = int(m.group(1)) + record = ptrn_corr_id.sub('', record) + kernel_arg = '' m = ptrn_fixkernel.search(record) if m: @@ -404,29 +407,50 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep # incrementing per-process record id/correlation id if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 - corr_id = record_id_dict[proc_id] record_id_dict[proc_id] += 1 + record_id = record_id_dict[proc_id] + + # setting correlationid to record id if correlation id is not defined + if corr_id == 0: corr_id = record_id + rec_vals.append(corr_id) # extracting/converting stream id (stream_id, stream_found) = get_field(record_args, 'stream') - if stream_found != 0: + if stream_found == 0: + stream_id = 0 + else: stream_id = get_stream_index(stream_id) (rec_vals[5], found) = set_field(record_args, 'stream', stream_id) if found == 0: fatal('set_field() failed for "stream", args: "' + record_args + '"') - else: stream_id = 0 # extract kernel name string (kernel_str, kernel_found) = get_field(record_args, 'kernel') + if kernel_found == 0: kernel_str = '' + else: op_found = 1 if stream_found != 0 or kernel_found != 0: - ops_patch_data[(corr_id, proc_id)] = (stream_id if stream_found else 0, kernel_str if 
kernel_found else '') + ops_patch_data[(corr_id, proc_id)] = (stream_id, kernel_str) # dependencies filling if ptrn_ac.match(record_name) or hip_mcopy_ptrn.match(record_name): + op_found = 1 + + # memcopy data + if len(copy_raws) != 0: + copy_data = list(copy_raws[copy_index]) + args_str = rec_vals[5] + args_str = re.sub(r'\(', r'', args_str) + args_str = re.sub(r'\).*$', r'', args_str) + copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + record_name + ', ' + args_str + copy_csv += str(copy_index) + ', ' + copy_line + '\n' + copy_index += 1 + + if op_found: beg_ns = int(rec_vals[0]) end_ns = int(rec_vals[1]) - from_us = end_ns / 1000 + dur_us = int((end_ns - beg_ns) / 1000) + from_us = int((beg_ns - START_NS) / 1000) + dur_us if not proc_id in dep_dict: dep_dict[proc_id] = {} dep_proc = dep_dict[proc_id] @@ -439,16 +463,6 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_str['from'].append((from_us, thrd_id, stream_id)) if expl_id: dep_str['id'].append(corr_id) - # memcopy data - if len(copy_raws) != 0: - copy_data = list(copy_raws[copy_index]) - args_str = rec_vals[5] - args_str = re.sub(r'\(', r'', args_str) - args_str = re.sub(r'\).*$', r'', args_str) - copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + record_name + ', ' + args_str - copy_csv += str(copy_index) + ', ' + copy_line + '\n' - copy_index += 1 - if False: # patching activity properties: kernel name, stream-id if (corr_id, proc_id) in dep_filtr: @@ -478,8 +492,8 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep # inserting of dispatch events correlated to the dependent dispatches for (from_ns, proc_id, thrd_id) in dep_list: if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 - corr_id = record_id_dict[proc_id] record_id_dict[proc_id] += 1 + corr_id = record_id_dict[proc_id] db.insert_entry(table_handle, [from_ns, from_ns, api_pid, thrd_id, 'hsa_dispatch', '', corr_id, '']) # generating 
memcopy CSV @@ -526,11 +540,14 @@ def fill_copy_db(table_name, db, indir): db.insert_entry(table_handle, rec_vals) # filling dependencies + to_ns = int(rec_vals[0]) + to_us = int((to_ns - START_NS) / 1000) + if not proc_id in dep_dict: dep_dict[proc_id] = {} dep_proc = dep_dict[proc_id] if not pid in dep_proc: dep_proc[pid] = { 'pid': HSA_PID, 'from': [], 'to': {}, 'id': [] } dep_str = dep_proc[pid] - dep_str['to'][corr_id] = int(rec_vals[0]) / 1000 + dep_str['to'][corr_id] = to_us else: fatal("async-copy bad record: '" + record + "'") @@ -577,7 +594,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): m = ptrn_id.match(label) if not m: fatal("bad hcc ops entry '" + record + "'") name = m.group(1) - corr_id = int(m.group(2)) - 1 + corr_id = int(m.group(2)) proc_id = int(m.group(3)) # checking name for memcopy pattern @@ -616,11 +633,14 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): filtr[(corr_id, proc_id)] = rec_table_name # filling a dependencies + to_ns = int(rec_vals[0]) + to_us = int((to_ns - START_NS) / 1000) + if not proc_id in dep_dict: dep_dict[proc_id] = {} dep_proc = dep_dict[proc_id] if not pid in dep_proc: dep_proc[pid] = { 'bsp': OPS_PID, 'to': {} } dep_str = dep_proc[pid] - dep_str['to'][corr_id] = int(rec_vals[0]) / 1000 + dep_str['to'][corr_id] = to_us else: fatal("hcc ops bad record: '" + record + "'") @@ -644,10 +664,10 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): with open(begin_ts_file, mode='r') as fd: ind = 0 for line in fd.readlines(): - val = int(line) / 1000 - if ind == 0 or val < START_US: START_US = val + val = int(line) + if ind == 0 or val < START_NS: START_NS = val ind += 1 - print('START timestamp found (' + str(START_US) + 'us)') + print('START timestamp found (' + str(START_NS) + 'ns)') if re.search(r'\.csv$', outfile): csvfile = outfile @@ -718,38 +738,38 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" 
+ str(ind), jsonfile) if ext_trace_found: - dform.gen_ext_json_trace(db, 'rocTX', START_US, jsonfile) + dform.gen_ext_json_trace(db, 'rocTX', START_NS, jsonfile) if len(var_table) != 0: dform.post_process_data(db, 'A', csvfile) dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') if hsa_trace_found and 'BeginNs' in var_list: - dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) + dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_NS, jsonfile) if hsa_trace_found: dform.post_process_data(db, 'HSA') dform.gen_table_bins(db, 'HSA', hsa_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) + dform.gen_api_json_trace(db, 'HSA', START_NS, jsonfile) if copy_trace_found: dform.post_process_data(db, 'COPY') dform.gen_table_bins(db, 'COPY', copy_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) + dform.gen_api_json_trace(db, 'COPY', START_NS, jsonfile) if hip_trace_found: dform.post_process_data(db, 'HIP') dform.gen_table_bins(db, 'HIP', hip_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) + dform.gen_api_json_trace(db, 'HIP', START_NS, jsonfile) if ops_filtr: dform.post_process_data(db, 'OPS') dform.gen_table_bins(db, 'OPS', ops_statfile, 'Name', 'DurationNs') - dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) + dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_NS, jsonfile) if kfd_trace_found: dform.post_process_data(db, 'KFD') dform.gen_table_bins(db, 'KFD', kfd_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'KFD', START_US, jsonfile) + dform.gen_api_json_trace(db, 'KFD', START_NS, jsonfile) if any_trace_found: dep_id = 0 @@ -771,7 +791,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): to_us_dict = dep_str['to'] corr_id_list = dep_str['id'] - db.flow_json(dep_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) + 
db.flow_json(dep_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, jsonfile) dep_id += len(from_us_list) if any_trace_found: diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index b176cadf..4a966190 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -74,6 +74,7 @@ typedef struct { uint32_t hsa_intercepting; uint32_t k_concurrent; uint32_t opt_mode; + uint32_t obj_dumping; } rocprofiler_settings_t; //////////////////////////////////////////////////////////////////////////////// @@ -481,7 +482,8 @@ typedef enum { ROCPROFILER_HSA_CB_ID_DEVICE = 1, // Device assign callback ROCPROFILER_HSA_CB_ID_MEMCOPY = 2, // Memcopy callback ROCPROFILER_HSA_CB_ID_SUBMIT = 3, // Packet submit callback - ROCPROFILER_HSA_CB_ID_KSYMBOL = 4 // Loading/unloading of kernel symbol + ROCPROFILER_HSA_CB_ID_KSYMBOL = 4, // Loading/unloading of kernel symbol + ROCPROFILER_HSA_CB_ID_CODEOBJ = 5 // Loading/unloading of kernel symbol } rocprofiler_hsa_cb_id_t; // HSA callback data type @@ -516,8 +518,20 @@ typedef struct { uint64_t object; // kernel symbol object const char* name; // kernel symbol name uint32_t name_length; // kernel symbol name length - int destroy; // symbol executable destroy + int unload; // symbol executable destroy } ksymbol; + struct { + uint32_t storage_type; // code object storage type + int storage_file; // origin file descriptor + uint64_t memory_base; // origin memory base + uint64_t memory_size; // origin memory size + uint64_t load_base; // codeobj load base + uint64_t load_size; // codeobj load size + uint64_t load_delta; // codeobj load size + uint32_t uri_length; // URI string length + char* uri; // URI string + int unload; // unload flag + } codeobj; }; } rocprofiler_hsa_callback_data_t; @@ -534,6 +548,7 @@ typedef struct { rocprofiler_hsa_callback_fun_t memcopy; // memory copy callback rocprofiler_hsa_callback_fun_t submit; // packet submit callback rocprofiler_hsa_callback_fun_t ksymbol; // kernel symbol callback + 
rocprofiler_hsa_callback_fun_t codeobj; // codeobject load/unload callback } rocprofiler_hsa_callbacks_t; // Set callbacks. If the callback is NULL then it is disabled. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4c97ea6f..dbe00cd9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,4 +35,4 @@ set ( LIB_SRC ) add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} ) target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ${HSA_KMT_LIB_PATH}/.. ) -target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++) +target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++ ) diff --git a/src/core/activity.cpp b/src/core/activity.cpp index 19f6bea3..2071b5cf 100644 --- a/src/core/activity.cpp +++ b/src/core/activity.cpp @@ -20,6 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *******************************************************************************/ +#define ROCP_INTERNAL_BUILD +#include "activity.h" + #include #include #include @@ -55,6 +58,92 @@ void check_status(hsa_status_t status) { } } +// Activity primitives +namespace activity_prim { +// PC sampling callback data +struct pcsmp_callback_data_t { + const char* kernel_name; // sampled kernel name + void* data_buffer; // host buffer for tracing data + uint64_t id; // sample id + uint64_t cycle; // sample cycle + uint64_t pc; // sample PC +}; + +uint32_t activity_op = UINT32_MAX; +void* activity_arg = NULL; +std::atomic activity_callback{NULL}; +rocprofiler_t* context = NULL; + +hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* data) { + const pcsmp_callback_data_t* pcsmp_data = (pcsmp_callback_data_t*) data; + + activity_record_t record{}; + record.op = activity_op; + record.pc_sample.se = pcsmp_data->id; + record.pc_sample.cycle = pcsmp_data->cycle; + record.pc_sample.pc = pcsmp_data->pc; + 
activity_async_callback_t fun = activity_callback.load(std::memory_order_acquire); + if (fun) { + (fun)(activity_op, &record, activity_arg); + } else { + free((void*)(pcsmp_data->kernel_name)); + } + return HSA_STATUS_SUCCESS; +} + +bool context_handler(rocprofiler_group_t group, void* arg) { + hsa_agent_t agent{}; + hsa_status_t status = rocprofiler_get_agent(group.context, &agent); + check_status(status); + const rocprofiler::util::AgentInfo* agent_info = rocprofiler::util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + + pcsmp_callback_data_t pcsmp_data{}; + pcsmp_data.kernel_name = (const char*)arg; + pcsmp_data.data_buffer = rocprofiler::util::HsaRsrcFactory::Instance().AllocateSysMemory(agent_info, rocprofiler::TraceProfile::GetSize()); + status = rocprofiler_iterate_trace_data(group.context, trace_data_cb, &pcsmp_data); + check_status(status); + return false; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + // context features + const rocprofiler_feature_kind_t trace_kind = + (rocprofiler_feature_kind_t)(ROCPROFILER_FEATURE_KIND_TRACE | ROCPROFILER_FEATURE_KIND_PCSMP_MOD); + const uint32_t feature_count = 1; + const uint32_t parameter_count = 1; + rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; + memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); + rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; + memset(features, 0, parameter_count * sizeof(rocprofiler_parameter_t)); + parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; + parameters[0].value = 0; + + features[0].kind = trace_kind; + features[0].parameters = parameters; + features[0].parameter_count = parameter_count; + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = 
(void*)strdup(callback_data->kernel_name); + + // Open profiling context + hsa_status_t status = rocprofiler_open(callback_data->agent, features, feature_count, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + return status; +} +} // namespace activity_prim + extern "C" { PUBLIC_API const char* GetOpName(uint32_t op) { return strdup("PCSAMPLE"); } @@ -63,10 +152,93 @@ PUBLIC_API bool RegisterApiCallback(uint32_t op, void* callback, void* arg) { re PUBLIC_API bool RemoveApiCallback(uint32_t op) { return true; } PUBLIC_API bool InitActivityCallback(void* callback, void* arg) { + activity_prim::activity_arg = arg; + activity_prim::activity_callback.store((activity_async_callback_t)callback, std::memory_order_release); + + rocprofiler_queue_callbacks_t queue_callbacks{}; + queue_callbacks.dispatch = activity_prim::dispatch_callback; + rocprofiler_set_queue_callbacks(queue_callbacks, NULL); + return true; } PUBLIC_API bool EnableActivityCallback(uint32_t op, bool enable) { + if (enable) { + activity_prim::activity_op = op; + rocprofiler_start_queue_callbacks(); + } else { + rocprofiler_stop_queue_callbacks(); + } + return true; +} + +struct evt_cb_entry_t { + typedef std::pair data_t; + data_t data; + std::mutex mutex; + + void set(const data_t& in) { + mutex.lock(); + data = in; + mutex.unlock(); + } + data_t get() { + mutex.lock(); + const data_t out = data; + mutex.unlock(); + return out; + } + evt_cb_entry_t() : data{} {} +}; +evt_cb_entry_t evt_cb_table[HSA_EVT_ID_NUMBER]; + +hsa_status_t codeobj_evt_callback( + rocprofiler_hsa_cb_id_t id, + const rocprofiler_hsa_callback_data_t* cb_data, + void* arg) +{ + const auto evt = evt_cb_table[id].get(); + activity_rtapi_callback_t evt_callback = (activity_rtapi_callback_t)evt.first; + if (evt_callback != NULL) evt_callback(ACTIVITY_DOMAIN_HSA_EVT, id, cb_data, evt.second); + return 
HSA_STATUS_SUCCESS; +} + +PUBLIC_API const char* GetEvtName(uint32_t op) { return strdup("CODEOBJ"); } + +PUBLIC_API bool RegisterEvtCallback(uint32_t op, void* callback, void* arg) { + evt_cb_table[op].set({callback, arg}); + + rocprofiler_hsa_callbacks_t ocb{}; + switch (op) { + case HSA_EVT_ID_ALLOCATE: + ocb.allocate = codeobj_evt_callback; + break; + case HSA_EVT_ID_DEVICE: + ocb.device = codeobj_evt_callback; + break; + case HSA_EVT_ID_MEMCOPY: + ocb.memcopy = codeobj_evt_callback; + break; + case HSA_EVT_ID_SUBMIT: + ocb.submit = codeobj_evt_callback; + break; + case HSA_EVT_ID_KSYMBOL: + ocb.ksymbol = codeobj_evt_callback; + break; + case HSA_EVT_ID_CODEOBJ: + ocb.codeobj = codeobj_evt_callback; + break; + default: + fatal("invalid activity opcode"); + } + rocprofiler_set_hsa_callbacks(ocb, NULL); + + return true; +} + +PUBLIC_API bool RemoveEvtCallback(uint32_t op) { + rocprofiler_hsa_callbacks_t ocb{}; + rocprofiler_set_hsa_callbacks(ocb, NULL); return true; } } // extern "C" diff --git a/src/core/activity.h b/src/core/activity.h new file mode 100644 index 00000000..ad64c0fa --- /dev/null +++ b/src/core/activity.h @@ -0,0 +1,26 @@ +#ifndef _SRC_CORE_ACTIVITY_H +#define _SRC_CORE_ACTIVITY_H + +#ifdef ROCP_INTERNAL_BUILD +#include "inc/rocprofiler.h" +#else +#include +#endif + +#include + +// HSA EVT ID enumeration +enum hsa_evt_id_t { + HSA_EVT_ID_ALLOCATE = ROCPROFILER_HSA_CB_ID_ALLOCATE, + HSA_EVT_ID_DEVICE = ROCPROFILER_HSA_CB_ID_DEVICE, + HSA_EVT_ID_MEMCOPY = ROCPROFILER_HSA_CB_ID_MEMCOPY, + HSA_EVT_ID_SUBMIT = ROCPROFILER_HSA_CB_ID_SUBMIT, + HSA_EVT_ID_KSYMBOL = ROCPROFILER_HSA_CB_ID_KSYMBOL, + HSA_EVT_ID_CODEOBJ = ROCPROFILER_HSA_CB_ID_CODEOBJ, + HSA_EVT_ID_NUMBER +}; + +// HSA EVT callback data type +typedef rocprofiler_hsa_callback_data_t hsa_evt_data_t; + +#endif // _SRC_CORE_ACTIVITY_H diff --git a/src/core/hsa_interceptor.h b/src/core/hsa_interceptor.h index 9207730b..ed20da96 100644 --- a/src/core/hsa_interceptor.h +++ 
b/src/core/hsa_interceptor.h @@ -51,7 +51,8 @@ SOFTWARE. (ID == ROCPROFILER_HSA_CB_ID_DEVICE) ? callbacks_.device: \ (ID == ROCPROFILER_HSA_CB_ID_MEMCOPY) ? callbacks_.memcopy: \ (ID == ROCPROFILER_HSA_CB_ID_SUBMIT) ? callbacks_.submit: \ - callbacks_.ksymbol; \ + (ID == ROCPROFILER_HSA_CB_ID_KSYMBOL) ? callbacks_.ksymbol: \ + callbacks_.codeobj; \ if ((__callback != NULL) && (recursion_ == false)) #define DO_HSA_CALLBACK \ @@ -230,12 +231,12 @@ class HsaInterceptor { rocprofiler_hsa_callback_data_t data{}; data.allocate.ptr = *ptr; data.allocate.size = size; - + HSA_RT(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &data.allocate.segment)); HSA_RT(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &data.allocate.global_flag)); - + DO_HSA_CALLBACK; - + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) { // Scan the pool assigned devices agent_callback_data_t callback_data{pool, *ptr}; @@ -303,44 +304,116 @@ class HsaInterceptor { void* arg) { const int free_flag = reinterpret_cast(arg); - rocprofiler_hsa_callback_data_t data{}; + hsa_ven_amd_loader_code_object_storage_type_t storage_type = + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE; + int storage_fd = -1; + uint64_t memory_base = 0; + uint64_t memory_size = 0; + uint64_t load_base = 0; + uint64_t load_size = 0; + uint64_t load_delta = 0; + uint32_t uri_len = 0; + char* uri_str = NULL; HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( loaded_code_object, - HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, - &data.allocate.ptr)); + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE, + &storage_type)); - if (free_flag == 0) { + if (storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE) { HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( loaded_code_object, - HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, - &data.allocate.size)); - } else { - data.allocate.size = 0; + 
HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE, + &storage_fd)); + if (storage_fd == -1) { + printf("CodeObjectCallback: fd == -1\n"); fflush(stdout); + abort(); + } + } else if (storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY) { + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE, + &memory_base)); + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE, + &memory_size)); + } + + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, + &load_base)); + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, + &load_size)); + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA, + &load_delta)); + + // Getting URI + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH, + &uri_len)); + + uri_str = (char*)calloc(uri_len + 1, sizeof(char)); + if (!uri_str) EXC_ABORT(HSA_STATUS_ERROR, "URI allocation"); + + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, + uri_str)); + + if (storage_type != HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE) { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_CODEOBJ) { + rocprofiler_hsa_callback_data_t data{}; + data.codeobj.storage_type = storage_type; + data.codeobj.storage_file = storage_fd; + data.codeobj.memory_base = memory_base; + data.codeobj.memory_size = memory_size; + data.codeobj.load_base = load_base; + 
data.codeobj.load_size = load_size; + data.codeobj.load_delta = load_delta; + data.codeobj.uri_length = uri_len; + data.codeobj.uri = uri_str; + data.codeobj.unload = free_flag; + + DO_HSA_CALLBACK; + } } - // Local GPU memory - // GLOBAL; FLAGS: COARSE GRAINED - data.allocate.segment = HSA_AMD_SEGMENT_GLOBAL; - data.allocate.global_flag = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED; - data.allocate.is_code = 1; + { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) { + // Local GPU memory + // GLOBAL; FLAGS: COARSE GRAINED + rocprofiler_hsa_callback_data_t data{}; + data.allocate.ptr = reinterpret_cast(load_base); + data.allocate.size = (free_flag == 0) ? load_size : 0; + data.allocate.segment = HSA_AMD_SEGMENT_GLOBAL; + data.allocate.global_flag = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED; + data.allocate.is_code = 1; - ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE); + DO_HSA_CALLBACK; + } + } - if (free_flag == 0) { + if (free_flag != 0) { IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) { hsa_amd_pointer_info_t pointer_info{}; uint32_t num_agents = 0; hsa_agent_t* agents = NULL; pointer_info.size = sizeof(hsa_amd_pointer_info_t); HSA_RT(hsa_amd_pointer_info( - const_cast(data.allocate.ptr), + reinterpret_cast(load_base), &pointer_info, malloc, &num_agents, &agents)); - - DeviceCallback(num_agents, agents, data.allocate.ptr); + + DeviceCallback(num_agents, agents, reinterpret_cast(load_base)); } } @@ -372,7 +445,7 @@ class HsaInterceptor { data.ksymbol.object = obj; data.ksymbol.name = name; data.ksymbol.name_length = len; - data.ksymbol.destroy = free_flag; + data.ksymbol.unload = free_flag; ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL); } @@ -388,22 +461,23 @@ class HsaInterceptor { HSA_RT(hsa_executable_freeze_fn(executable, options)); - IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL) { + HSA_RT(hsa_executable_iterate_symbols( + executable, + KernelSymbolCallback, + reinterpret_cast(0))); + } + + 
unsigned is_codeobj_cb = 0; + { IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_CODEOBJ) is_codeobj_cb |= 1; } + { IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) is_codeobj_cb |= 1; } + if (is_codeobj_cb) { LoaderApiTable.hsa_ven_amd_loader_executable_iterate_loaded_code_objects( executable, CodeObjectCallback, reinterpret_cast(0)); } - { - IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL) { - HSA_RT(hsa_executable_iterate_symbols( - executable, - KernelSymbolCallback, - reinterpret_cast(0))); - } - } - return status; } diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 5d1cd9c7..a544ca58 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -219,6 +219,7 @@ uint32_t LoadTool() { if (settings.k_concurrent) { Context::k_concurrent_ = settings.k_concurrent; InterceptQueue::k_concurrent_ = settings.k_concurrent; + InterceptQueue::TrackerOn(true); } if (settings.opt_mode) InterceptQueue::opt_mode_ = true; } @@ -536,7 +537,8 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa rocprofiler::StandaloneIntercept(); } - ONLOAD_TRACE_END(); + ONLOAD_TRACE("end intercept_mode(" << std::hex << intercept_mode << ")" << + " intercept_mode_mask(" << std::hex << intercept_mode_mask << ")" << std::dec); return true; } diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 34650a34..4bdce5dd 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -87,6 +87,7 @@ struct kernel_properties_t { uint32_t sgpr_count; uint32_t fbarrier_count; hsa_signal_t signal; + uint64_t object; }; // Context stored entry type @@ -392,7 +393,7 @@ bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), obj(0x%lx), kernel-name(\"%s\")", index, agent_info->dev_index, entry->data.queue_id, @@ -407,6 +408,7 @@ bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { (entry->kernel_properties.sgpr_count + agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, entry->kernel_properties.fbarrier_count, entry->kernel_properties.signal.handle, + entry->kernel_properties.object, nik_name.c_str()); if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", record->dispatch, @@ -540,6 +542,37 @@ bool context_pool_handler(const rocprofiler_pool_entry_t* entry, void* arg) { return false; } +// Profiling completion handler for concurrent implementation +// Dump the context entry +// Return true if the context was dumped successfully +bool context_handler_con(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + bool ret = true; + ret = dump_context_entry(entry); + if (ret == false) { + fprintf(stderr, "tool error: context is not complete\n"); + abort(); + } + + if (trace_on) { + fprintf(stdout, "tool::handler_con: context_map %d tid %u\n", (int)(ctx_a_map->size()), GetTid()); + fflush(stdout); + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + bool check_filter(const rocprofiler_callback_data_t* callback_data, const callbacks_data_t* 
tool_data) { bool found = true; @@ -617,6 +650,7 @@ void set_kernel_properties(const rocprofiler_callback_data_t* callback_data, kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; kernel_properties_ptr->signal = callback_data->completion_signal; + kernel_properties_ptr->object = callback_data->packet->kernel_object; } // Kernel disoatch callback @@ -881,6 +915,7 @@ rocprofiler_hsa_callbacks_t hsa_callbacks { hsa_unified_callback, hsa_unified_callback, hsa_unified_callback, + NULL, NULL }; @@ -889,7 +924,7 @@ hsa_status_t hsa_ksymbol_cb(rocprofiler_hsa_cb_id_t id, const rocprofiler_hsa_callback_data_t* data, void* arg) { - HsaRsrcFactory::SetKernelNameRef(data->ksymbol.object, data->ksymbol.name, data->ksymbol.destroy); + HsaRsrcFactory::SetKernelNameRef(data->ksymbol.object, data->ksymbol.name, data->ksymbol.unload); return HSA_STATUS_SUCCESS; } @@ -1195,8 +1230,6 @@ void rocprofiler_unload(bool is_destr) { abort(); } - if (is_destr) CTX_OUTSTANDING_WAIT = 0; - // Unregister dispatch callback rocprofiler_remove_queue_callbacks(); @@ -1216,7 +1249,6 @@ void rocprofiler_unload(bool is_destr) { } fflush(stdout); -#if 0 // Cleanup if (callbacks_data != NULL) { delete[] callbacks_data->features; @@ -1233,7 +1265,6 @@ void rocprofiler_unload(bool is_destr) { range_vec = NULL; delete context_array; context_array = NULL; -#endif ONLOAD_TRACE_END(); } From 939b0e2124f8bb10126cee48496d5692079a4709 Mon Sep 17 00:00:00 2001 From: Bert Wesarg Date: Sat, 12 Dec 2020 09:03:23 +0100 Subject: [PATCH 134/153] Avoid empty-argument lists in C function declarations `()` as a function argument list in C is equivalent to `(...)`, i.e, in C you get at most a warning, if it is called with too many arguments. Clarify this situation by explicitly stating `(void)` as argument list. 
--- inc/rocprofiler.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 4a966190..0adbb585 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -57,8 +57,8 @@ extern "C" { //////////////////////////////////////////////////////////////////////////////// // Returning library version -uint32_t rocprofiler_version_major(); -uint32_t rocprofiler_version_minor(); +uint32_t rocprofiler_version_major(void); +uint32_t rocprofiler_version_minor(void); //////////////////////////////////////////////////////////////////////////////// // Global properties structure @@ -275,11 +275,11 @@ hsa_status_t rocprofiler_set_queue_callbacks( void* data); // [in/out] passed callbacks data // Remove queue callbacks -hsa_status_t rocprofiler_remove_queue_callbacks(); +hsa_status_t rocprofiler_remove_queue_callbacks(void); // Start/stop queue callbacks -hsa_status_t rocprofiler_start_queue_callbacks(); -hsa_status_t rocprofiler_stop_queue_callbacks(); +hsa_status_t rocprofiler_start_queue_callbacks(void); +hsa_status_t rocprofiler_stop_queue_callbacks(void); //////////////////////////////////////////////////////////////////////////////// // Start/stop profiling From 58ae9b67ae070868520118b8e6f4c64d950b6111 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Jan 2021 12:53:59 -0600 Subject: [PATCH 135/153] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 75ab58b7..37952a56 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -746,7 +746,9 @@ Fetching a context entry: ``` ### 5.5. Standalone Sampling Usage Code Example ``` -The profiling metrics are being read from separate standalone queue other than the application kernels are submitted to. To enable the sampling mode, the profiling mode in all user queues should be enabled. 
It can be done by loading ROC-profiler library to HSA runtime using the environment variable HSA_TOOLS_LIB for all shell sessions. +The profiling metrics are being read from separate standalone queue other than the application kernels are submitted to. +To enable the sampling mode, the profiling mode in all user queues should be enabled. It can be done by loading ROC-profiler +library to HSA runtime using the environment variable HSA_TOOLS_LIB for all shell sessions. // Sampling rate uint32_t sampling_rate = ; // Sampling count From 4f948e5e17d80c789409b06f524923be98129ab5 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Jan 2021 19:50:21 -0600 Subject: [PATCH 136/153] Update rocprof.md --- doc/rocprof.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/rocprof.md b/doc/rocprof.md index 717653eb..3b4c9f99 100644 --- a/doc/rocprof.md +++ b/doc/rocprof.md @@ -193,7 +193,9 @@ Tracing can be filtered by events names using profiler input file and by enablin #### 3.2.1. Filtering traced APIs A list of traced API names can be specified in profiler input file. An example of input file line for ROCr runtime trace (HAS API): +``` hsa: hsa_queue_create hsa_amd_memory_pool_allocate +``` #### 3.2.2. 
Tracing time period Trace can be dumped periodically with initial delay, dumping period length and rate: ``` From 8cd086e6fd568367056c79dd7d14e07d67c76861 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 8 Feb 2021 10:01:48 -0600 Subject: [PATCH 137/153] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 91361b56..85276d07 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ To use the rocProfiler API you need the API header and to link your application ## Internal 'simple_convolution' test run script: ``` cd .../rocprofiler/build + make mytest run.sh ``` From 93778bdc4fa5403fedede3afcc1155338c62a6f3 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 8 Apr 2021 10:13:00 -0500 Subject: [PATCH 138/153] cleanup --- test/tool/input.xml | 14 -------------- test/tool/input1.xml | 14 -------------- test/tool/input2.xml | 5 ----- 3 files changed, 33 deletions(-) delete mode 100644 test/tool/input.xml delete mode 100644 test/tool/input1.xml delete mode 100644 test/tool/input2.xml diff --git a/test/tool/input.xml b/test/tool/input.xml deleted file mode 100644 index f4ecd178..00000000 --- a/test/tool/input.xml +++ /dev/null @@ -1,14 +0,0 @@ -# Filter by dispatches range, GPU index and kernel names - - -# List of metrics - diff --git a/test/tool/input1.xml b/test/tool/input1.xml deleted file mode 100644 index f4ecd178..00000000 --- a/test/tool/input1.xml +++ /dev/null @@ -1,14 +0,0 @@ -# Filter by dispatches range, GPU index and kernel names - - -# List of metrics - diff --git a/test/tool/input2.xml b/test/tool/input2.xml deleted file mode 100644 index 254c83dc..00000000 --- a/test/tool/input2.xml +++ /dev/null @@ -1,5 +0,0 @@ -# List of metrics - From 48cc7855d52e0024071447aa061c65c6fa204fcd Mon Sep 17 00:00:00 2001 From: Christophe Paquot Date: Tue, 22 Jun 2021 21:12:08 -0700 Subject: [PATCH 139/153] SWDEV-282961: dependency arrows missing When building the json data 
flow, from_us_list has (timestamp, stream_id, thread_id). stream_id used to be interpreted as from_tid and tid as to_tid. But that's not correct. stream_id is always a destination and tid is the initiator (source). Change-Id: I2f5bb86a387b4003b17271c90bdf9de4b59a79bf --- bin/sqlitedb.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index dc5358ff..00a7dba2 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -159,11 +159,11 @@ def flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id for ind in range(len(from_us_list)): corr_id = corr_id_list[ind] if (len(corr_id_list) != 0) else ind if corr_id in to_us_dict: - (from_ts, from_tid, to_tid) = from_us_list[ind] + (from_ts, stream_id, tid) = from_us_list[ind] to_ts = to_us_dict[corr_id] if from_ts > to_ts: from_ts = to_ts - fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (from_ts, dep_id, from_pid, from_tid)) - fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (to_ts, dep_id, to_pid, to_tid)) + fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (from_ts, dep_id, from_pid, tid)) + fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (to_ts, dep_id, to_pid, stream_id)) dep_id += 1 def metadata_json(self, jsonfile, sysinfo_file): From 9f0ca101ec2286827c9b1cd8ccfa2ee3438f6a15 Mon Sep 17 00:00:00 2001 From: AMD Date: Wed, 16 Jun 2021 18:33:58 -0500 Subject: [PATCH 140/153] Add support for gfx90a Merge gfx90a support from the 'amd-npi' branch. 
Change-Id: I9b51711ed4a1d2f1ed42ba9b83cb12136be228b8 (cherry picked from commit 4df3e0bd9ae6e5982b43fd2fc3867cf5f0b87a53) --- src/core/metrics.h | 6 +++--- test/tool/gfx_metrics.xml | 17 +++++++++++++++++ test/tool/metrics.xml | 8 ++++++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/core/metrics.h b/src/core/metrics.h index a221168a..6eeebee3 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -196,9 +196,9 @@ class MetricsDict { xml_->AddConst("top.const.metric", "SE_NUM", agent_info->se_num); ImportMetrics(agent_info, "const"); agent_name_ = agent_info->name; - if (std::string("gfx906") == agent_info->name) { - ImportMetrics(agent_info, agent_info->name); - } else if (std::string("gfx908") == agent_info->name) { + if (std::string("gfx906") == agent_info->name || + std::string("gfx908") == agent_info->name || + std::string("gfx90a") == agent_info->name) { ImportMetrics(agent_info, agent_info->name); } else { agent_name_ = agent_info->gfxip; diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index 698826c6..c2a79af2 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -101,3 +101,20 @@ + + + + + + + + + + + + + + + + + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index c340a439..a920ff04 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -65,12 +65,16 @@ + + -# VG20 +# Vega20 -# MI100 +# Arcturus +# Aldebaran + # GPUBusy The percentage of time GPU was busy. From a369af3049bc3a6d2faca537f4e47d5a046f01cf Mon Sep 17 00:00:00 2001 From: rachida Date: Tue, 18 May 2021 20:11:48 -0400 Subject: [PATCH 141/153] SWDEV-282961 Skip barrier events. Process hipMemSet events Marker events inside hcc_ops_trace.txt are from barriers so they are not meant to be stored in ops_patch_data map. Added support for hipMemset events which are a kind of memory copy. 
Change-Id: I213fe959bcd35ff0371613ba5bffd95bc53e06b5 (cherry picked from commit caa5f323007734fd0b14b3fa49618a5d7cc7acdd) --- bin/tblextr.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 61644e2a..deafb199 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -354,7 +354,7 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep copy_index = 0 ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') - hip_mcopy_ptrn = re.compile(r'hipMemcpy') + hip_mcopy_ptrn = re.compile(r'hipMemcpy|hipMemset') hip_wait_event_ptrn = re.compile(r'WaitEvent') hip_sync_event_ptrn = re.compile(r'hipStreamSynchronize') hip_sync_dev_event_ptrn = re.compile(r'hipDeviceSynchronize') @@ -430,7 +430,6 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep if corr_id == 0: corr_id = record_id rec_vals.append(corr_id) - # extracting/converting stream id (stream_id, stream_found) = get_field(record_args, 'stream') if stream_found == 0: @@ -489,9 +488,6 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep mcopy_found = 1 op_found = 1 - if op_found: - ops_patch_data[(corr_id, proc_id)] = (thread_id, stream_id, kernel_str) - # HIP WaitEvent API if wait_event_ptrn.search(record_name): op_found = 1 @@ -505,6 +501,9 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep hsa_patch_data[(copy_index, proc_id)] = thread_id copy_index += 1 + if op_found: + ops_patch_data[(corr_id, proc_id)] = (thread_id, stream_id, kernel_str) + if op_found: op_found = 0 beg_ns = int(rec_vals[0]) @@ -650,6 +649,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): proc_id = int(m.group(3)) # checking name for memcopy pattern + is_barrier = 0 if ptrn_mcopy.search(name): rec_table_name = mcopy_table_name table_handle = mcopy_table_handle @@ -664,6 +664,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, 
indir): if ptrn_barrier.search(name): name = '""' + is_barrier = 1 thread_id = 0 stream_id = 0 @@ -671,7 +672,8 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): (thread_id, stream_id, name_patch) = ops_patch_data[(corr_id, proc_id)] if name_patch != '': name = name_patch else: - fatal("hcc ops data not found: '" + record + "', " + str(corr_id) + ", " + str(proc_id)) + if is_barrier: continue + else: fatal("hcc ops data not found: '" + record + "', " + str(corr_id) + ", " + str(proc_id)) # activity record rec_vals[4] = name # Name @@ -878,3 +880,4 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): sys.exit(0) ############################################################# + From 2f189791a5cf6a6beb0aea9572d8760f792f0b32 Mon Sep 17 00:00:00 2001 From: Icarus Sparry Date: Wed, 7 Jul 2021 15:49:02 +0000 Subject: [PATCH 142/153] Add dependency on rocm-core Signed-off-by: Icarus Sparry Change-Id: Icb935e9230888fd005d9ca3617e28f6173173cc8 --- CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c34f7cc9..e47f06df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,7 +204,7 @@ else() endif() message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) -set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocm-core" ) ## Process the Debian install/remove scripts to update the CPACK variables configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY ) configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY ) @@ -229,12 +229,17 @@ if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) endif() set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) -set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocm-core" 
) ## Process the Rpm install/remove scripts to update the CPACK variables configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/post.in" RPM/post @ONLY ) configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/postun.in" RPM/postun @ONLY ) set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/postun" ) +# Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake +if(NOT ROCM_DEP_ROCMCORE) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS}) +endif() include ( CPack ) From ae6c093864009c8974c50db3b52d21839663738a Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Wed, 27 Oct 2021 19:06:30 -0600 Subject: [PATCH 143/153] Adding Known Issues --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 85276d07..48033a1f 100644 --- a/README.md +++ b/README.md @@ -195,3 +195,10 @@ Configuration file: obj-tracking=on > ``` + + +## Known Issues: +- For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" + - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. +- When the same kernel is launched back to back multiple times on a GPU, the cache hit rate from rocprofiler is reported as 0% or very low. This also causes FETCH_SIZE to be not usable for repeatable kernel. +- OpenMP applications are not fully supported by the rocprofiler. 
From 83592409edf7e69695abe0d6fd1ae0e8c8dc6e69 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Wed, 27 Oct 2021 19:07:52 -0600 Subject: [PATCH 144/153] Updating Known Issues --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 48033a1f..95a3476f 100644 --- a/README.md +++ b/README.md @@ -201,4 +201,3 @@ Configuration file: - For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. - When the same kernel is launched back to back multiple times on a GPU, the cache hit rate from rocprofiler is reported as 0% or very low. This also causes FETCH_SIZE to be not usable for repeatable kernel. -- OpenMP applications are not fully supported by the rocprofiler. From b71b5414d1e1d52d2dfe30b7216fc809613a0f44 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Fri, 21 Jan 2022 12:05:10 -0600 Subject: [PATCH 145/153] SWDEV-318551: Adding License file for profiler Making the new License file, Adding support in the CMakeLists.txt Change-Id: I785035a780fbfc59951fc27d45f9c1869ffb4fb3 --- CMakeLists.txt | 5 +++++ LICENSE | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 68f9b86a..dfb069be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,6 +191,9 @@ set ( CPACK_PACKAGE_CONTACT "ROCm Profiler Support Date: Wed, 9 Feb 2022 14:18:35 -0600 Subject: [PATCH 146/153] Update Readme --- Readme.txt | 211 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 178 insertions(+), 33 deletions(-) diff --git a/Readme.txt b/Readme.txt index 9008165a..e83b410c 100644 --- a/Readme.txt +++ b/Readme.txt @@ -1,54 +1,199 @@ -ROC Profiler library. -Profiling with metrics and traces based on perfcounters (PMC) and traces (SPM). 
-Implementation is based on AqlProfile HSA extension. -Library supports GFX8/GFX9. +# ROC-profiler +ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. -The library source tree: - - doc - Documentation +HW specific low-level performance analysis interface for profiling of GPU compute applications. The +profiling includes HW performance counters with complex performance metrics. + +To use the rocProfiler API you need the API header and to link your application with the rocprofiler .so library: + - the API header: /opt/rocm/rocprofiler/include/rocprofiler.h + - the .so library: /opt/rocm/lib/librocprofiler64.so + +## Documentation +- ['rocprof' cmdline tool specification](doc/rocprof.md) +- ['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) + +## Metrics +[The link to profiler default metrics XML specification](test/tool/metrics.xml) + + +## Source tree +``` + - bin + - rocprof - Profiling tool run script + - doc - Documentation - inc/rocprofiler.h - Library public API - src - Library sources - core - Library API sources - util - Library utils sources - xml - XML parser - test - Library test suite + - tool - Profiling tool + - tool.cpp - tool sources + - metrics.xml - metrics config file - ctrl - Test controll - util - Test utils - simple_convolution - Simple convolution test kernel +``` + +## Build environment: +``` + export CMAKE_PREFIX_PATH=: + export CMAKE_BUILD_TYPE= # release by default + export CMAKE_DEBUG_TRACE=1 # to enable debug tracing +``` + +## To build with the current installed ROCM: +``` + - ROCm is required. + ROCr-runtime and roctracer are needed + + - Python is required. + The required modules: CppHeaderParser, argparse, sqlite3 + To install: + sudo pip install CppHeaderParser argparse sqlite3 + + - To build and install to /opt/rocm/rocprofiler + Please use release branches/tags of 'amd-master' branch for development version.
+ + export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm + + cd .../rocprofiler + ./build.sh +``` + +## Internal 'simple_convolution' test run script: +``` + cd .../rocprofiler/build + make mytest + run.sh +``` + +## To enable error messages logging to '/tmp/rocprofiler_log.txt': +``` + export ROCPROFILER_LOG=1 +``` + +## To enable verbose tracing: +``` + export ROCPROFILER_TRACE=1 +``` + +## Profiling utility usage: +``` +rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts FetchSize + # Perf counters group 2 + pmc : VALUUtilization,WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 -Build environment: + Input file .xml format, for single profiling run: -$ export CMAKE_PREFIX_PATH=: -$ export CMAKE_BUILD_TYPE= # release by default -$ export CMAKE_DEBUG_TRACE=1 # 1 to enable debug tracing + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + -To build with the current installed ROCM: + # Filter by dispatches range, GPU index and kernel names + -$ cd .../rocprofiler -$ export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm -$ mkdir build -$ cd build -$ cmake .. 
-$ make + -o - output CSV file [.csv] + The output CSV file columns meaning in the columns order: + Index - kernels dispatch order index + KernelName - the dispatched kernel name + gpu-id - GPU id the kernel was submitted to + queue-id - the ROCm queue unique id the kernel was submitted to + queue-index - The ROCm queue write index for the submitted AQL packet + tid - system application thread id which submitted the kernel + grd - the kernel's grid size + wgr - the kernel's work group size + lds - the kernel's LDS memory size + scr - the kernel's scratch memory size + vgpr - the kernel's VGPR size + sgpr - the kernel's SGPR size + fbar - the kernel's barriers limitation + sig - the kernel's completion signal + ... - The columns with the counters values per kernel dispatch + DispatchNs/BeginNs/EndNs/CompleteNs - timestamp columns if time-stamping was enabled + + -d - directory where profiler stores profiling data including thread traces [/tmp] + The data directory is removed automatically if the directory matches the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from a non-'/tmp' directory.
-To run the test: + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel dispatches timestamps, dispatch/begin/end/complete [off] + Four kernel timestamps in nanoseconds are reported: + DispatchNs - the time when the kernel AQL dispatch packet was written to the queue + BeginNs - the kernel execution begin time + EndNs - the kernel execution end time + CompleteNs - the time when the completion signal of the AQL dispatch packet was received -$ cd .../rocprofiler/build -$ export LD_LIBRARY_PATH=.: # paths to ROC profiler and oher libraries -$ export HSA_TOOLS_LIB=librocprofiler64.so # ROC profiler library loaded by HSA runtime -$ export ROCP_TOOL_LIB=test/libtool.so # tool library loaded by ROC profiler -$ export ROCP_METRICS=metrics.xml # ROC profiler metrics config file -$ export ROCP_INPUT=input.xml # input file for the tool library -$ export ROCP_OUTPUT_DIR=./ # output directory for the tool library, for metrics results file 'results.txt' and trace files -$ + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [on] + To support V3 code-object. -Internal 'simple_convolution' test run script: -$ cd .../rocprofiler/build -$ run.sh + --stats - generating kernel execution stats, file .stats.csv + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. 
+ --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible + Generated files: ._stats.txt .json + Traced API list can be set by input .txt or .xml files. + Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + -To enabled error messages logging to '/tmp/rocprofiler_log.txt': + --trace-start - to enable tracing on start [on] + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: -$ export ROCPROFILER_LOG=1 +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:$HOME: + First the configuration file is looking in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. + An example of 'rpl_rc.xml': + +``` -To enable verbose tracing: -$ export ROCPROFILER_TRACE=1 +## Known Issues: +- For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" + - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. +- When the same kernel is launched back to back multiple times on a GPU, the cache hit rate from rocprofiler is reported as 0% or very low. This also causes FETCH_SIZE to be not usable for repeatable kernel. 
From 967d6c27259f0e2b1d0bf254b14e8ec7cd70009b Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Wed, 9 Feb 2022 14:19:02 -0600 Subject: [PATCH 147/153] Rename Readme.txt to README.md --- Readme.txt => README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Readme.txt => README.md (100%) diff --git a/Readme.txt b/README.md similarity index 100% rename from Readme.txt rename to README.md From b318ef99a9a23bdf827234de59888b4dfddd35e7 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Thu, 24 Feb 2022 10:34:52 -0600 Subject: [PATCH 148/153] Create rocprof.md --- doc/rocprof.md | 393 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) create mode 100644 doc/rocprof.md diff --git a/doc/rocprof.md b/doc/rocprof.md new file mode 100644 index 00000000..3b4c9f99 --- /dev/null +++ b/doc/rocprof.md @@ -0,0 +1,393 @@ +# rocprof +## 1. Overview +The rocProf is a command line tool implemented on the top of rocProfiler and rocTracer APIs. Source code for rocProf may be found here: +GitHub: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/bin/rocprof +This command line tool is implemented as a script which is setting up the environment for attaching the profiler and then run the provided application command line. The tool uses two profiling plugins loaded by ROC runtime and based on rocProfiler and rocTracer for collecting metrics/counters, HW traces and runtime API/activity traces. The tool consumes an input XML or text file with counters list or trace parameters and provides output profiling data and statistics in various formats as text, CSV and JSON traces. Google Chrome tracing can be used to visualize the JSON traces with runtime API/activity timelines and per kernel counters data. +## 2. Profiling Modes +‘rocprof’ can be used for GPU profiling using HW counters and application tracing +### 2.1. 
GPU profiling +GPU profiling is controlled with input file which defines a list of metrics/counters and a profiling scope. An input file is provided using option ‘-i ’. Output CSV file with a line per submitted kernel is generated. Each line has kernel name, kernel parameters and counter values. By option ‘—stats’ the kernel execution stats can be generated in CSV format. Currently profiling has limitation of serializing submitted kernels. +An example of input file: +``` + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts + # Perf counters group 2 + pmc : TCC_HIT[0], TCC_MISS[0] + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 +``` +An example of profiling command line for ‘MatrixTranspose’ application +``` +$ rocprof -i input.txt MatrixTranspose +RPL: on '191018_011134' from '/…./rocprofiler_pkg' in '/…./MatrixTranspose' +RPL: profiling '"./MatrixTranspose"' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_011134_9695' +RPL: result dir '/tmp/rpl_data_191018_011134_9695/input0_results_191018_011134' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_011134_9695/input0.xml" + gpu_index = + kernel = + range = + 4 metrics + L2CacheHit, VFetchInsts, VWriteInsts, MemUnitStalled + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] +PASSED! + +ROCPRofiler: 1 contexts collected, output directory /tmp/rpl_data_191018_011134_9695/input0_results_191018_011134 +RPL: '/…./MatrixTranspose/input.csv' is generated +``` +#### 2.1.1. Counters and metrics +There are two profiling features, metrics and traces. Hardware performance counters are treated as the basic metrics and the formulas can be defined for derived metrics. 
+Counters and metrics can be dynamically configured using XML configuration files with counters and metrics tables: + - Counters table entry, basic metric: counter name, block name, event id + - Derived metrics table entry: metric name, an expression for calculation the metric from the counters + +Metrics XML File Example: +``` + + + + . . . + + + + . . . + + + + + +``` +##### 2.1.1.1. Metrics query +Available counters and metrics can be queried by options ‘—list-basic’ for counters and ‘—list-derived’ for derived metrics. The output for counters indicates number of block instances and number of block counter registers. The output for derived metrics prints the metrics expressions. +Examples: +``` +$ rocprof --list-basic +RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCProfiler: rc-file '/…./rpl_rc.xml' +Basic HW counters: + gpu-agent0 : GRBM_COUNT : Tie High - Count Number of Clocks + block GRBM has 2 counters + gpu-agent0 : GRBM_GUI_ACTIVE : The GUI is Active + block GRBM has 2 counters + . . . + gpu-agent0 : TCC_HIT[0-15] : Number of cache hits. + block TCC has 4 counters + gpu-agent0 : TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. + block TCC has 4 counters + . . . + +$ rocprof --list-derived +RPL: on '191018_015911' from '/opt/rocm/rocprofiler' in '/home/evgeny/work/BUILD/0_MatrixTranspose' +ROCProfiler: rc-file '/home/evgeny/rpl_rc.xml' +Derived metrics: + gpu-agent0 : TCC_HIT_sum : Number of cache hits. Sum over TCC instances. + TCC_HIT_sum = sum(TCC_HIT,16) + gpu-agent0 : TCC_MISS_sum : Number of cache misses. Sum over TCC instances. + TCC_MISS_sum = sum(TCC_MISS,16) + gpu-agent0 : TCC_MC_RDREQ_sum : Number of 32-byte reads. Sum over TCC instaces. + TCC_MC_RDREQ_sum = sum(TCC_MC_RDREQ,16) + . . . +``` +##### 2.1.1.2. Metrics collecting +Counters and metrics accumulated per kernel can be collected using input file with a list of metrics, see an example in 2.1. 
+Currently profiling has limitation of serializing submitted kernels. +The number of counters which can be dumped by one run is limited by GPU HW by number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. +###### 2.1.1.2.1. Blocks instancing +GPU blocks are implemented as several identical instances. To dump counters of specific instance square brackets can be used, see an example in 2.1. +The number of block instances can be queried, see 2.1.1.1. +###### 2.1.1.2.2. HW limitations +The number of counters which can be dumped by one run is limited by GPU HW by number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. + - Metrics groups + +To dump a list of metrics exceeding HW limitations the metrics list can be split on groups. +The tool supports automatic splitting on optimal metric groups: +``` +$ rocprof -i input.txt ./MatrixTranspose +RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +RPL: profiling './MatrixTranspose' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_032645_12106' +RPL: result dir '/tmp/rpl_data_191018_032645_12106/input0_results_191018_032645' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_032645_12106/input0.xml" + gpu_index = + kernel = + range = + 20 metrics + Wavefronts, VALUInsts, SALUInsts, SFetchInsts, FlatVMemInsts, LDSInsts, FlatLDSInsts, GDSInsts, VALUUtilization, FetchSize, WriteSize, L2CacheHit, VWriteInsts, GPUBusy, VALUBusy, SALUBusy, MemUnitStalled, WriteUnitStalled, LDSBankConflict, MemUnitBusy + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] + +Input metrics out of HW limit. 
Proposed metrics group set: + group1: L2CacheHit VWriteInsts MemUnitStalled WriteUnitStalled MemUnitBusy FetchSize FlatVMemInsts LDSInsts VALUInsts SALUInsts SFetchInsts FlatLDSInsts GPUBusy Wavefronts + group2: WriteSize GDSInsts VALUUtilization VALUBusy SALUBusy LDSBankConflict + +ERROR: rocprofiler_open(), Construct(), Metrics list exceeds HW limits + +Aborted (core dumped) +Error found, profiling aborted. +``` + - Collecting with multiple runs + +To collect several metric groups a full application replay is used by defining several ‘pmc:’ lines in the input file, see 2.1. + +### 2.2. Application tracing +Supported application tracing includes runtime API and GPU activity tracing. +Supported runtimes are: ROCr (HSA API) and HIP +Supported GPU activity: kernel execution, async memory copy, barrier packets. +The trace is generated in JSON format compatible with Chrome tracing. +The trace consists of several sections with timelines for API trace per thread and GPU activity. The timelines events show event name and parameters. +Supported options: ‘--hsa-trace’, ‘--hip-trace’, ‘--sys-trace’, where ‘sys trace’ is for HIP and HSA combined trace. +#### 2.2.1. HIP runtime trace +The trace is generated by option ‘--hip-trace’ and includes HIP API timelines and GPU activity at the runtime level. +#### 2.2.2. ROCr runtime trace +The trace is generated by option ‘--hsa-trace’ and includes ROCr API timelines and GPU activity at AQL queue level. It can also provide counters per kernel. +#### 2.2.3. KFD driver trace +The trace is generated by option ‘--kfd-trace’ and includes KFD Thunk API timeline. +It is planned to add memory allocations/migration tracing. +#### 2.2.4. Code annotation +Support for application code annotation. +Start/stop API is supported to programmatically control the profiling. +A ‘roctx’ library provides annotation API. Annotation is visualized in JSON trace as a separate "Markers and Ranges" timeline section. +##### 2.2.4.1.
Start/stop API +``` +// Tracing start API +void roctracer_start(); + +// Tracing stop API +void roctracer_stop(); +``` +##### 2.2.4.2. rocTX basic markers API +``` +// A marker created by given ASCII message +void roctxMark(const char* message); + +// Returns the 0 based level of a nested range being started by given message associated to this range. +// A negative value is returned on the error. +int roctxRangePush(const char* message); + +// Marks the end of a nested range. +// Returns the 0 based level of the range. +// A negative value is returned on the error. +int roctxRangePop(); +``` +### 2.3. Multiple GPUs profiling +The profiler supports multiple GPUs profiling and provides GPU id for counters and kernels data in the CSV output file. Also, the GPU id is indicated for the respective GPU activity timeline in the JSON trace. +## 3. Profiling control +Profiling can be controlled by specifying a profiling scope, by filtering trace events and specifying interesting time intervals. +### 3.1. Profiling scope +Counters profiling scope can be specified by GPU id list, kernel name substrings list and dispatch range. +Supported range formats examples: "3:9", "3:", "3". You can see an example of input file in 2.1. +### 3.2. Tracing control +Tracing can be filtered by events names using profiler input file and by enabling interesting time intervals by command line option. +#### 3.2.1. Filtering traced APIs +A list of traced API names can be specified in profiler input file. +An example of input file line for ROCr runtime trace (HSA API): +``` +hsa: hsa_queue_create hsa_amd_memory_pool_allocate +``` +#### 3.2.2. Tracing time period +Trace can be dumped periodically with initial delay, dumping period length and rate: +``` +--trace-period +``` +### 3.3. Concurrent kernels +Currently concurrent kernels profiling is not supported which is a planned feature. Kernels are serialized. +### 3.4. Multi-processes profiling +Multi-processes profiling is not currently supported. +### 3.5.
Errors logging +Profiler errors are logged to global logs: +``` +/tmp/aql_profile_log.txt +/tmp/rocprofiler_log.txt +/tmp/roctracer_log.txt +``` +## 4. 3rd party visualization tools +‘rocprof’ is producing JSON trace compatible with Chrome Tracing, which is an internal trace visualization tool in Google Chrome. +### 4.1. Chrome tracing +Good review can be found by the link: https://aras-p.info/blog/2017/01/23/Chrome-Tracing-as-Profiler-Frontend/ +## 5. Command line options +The command line options can be printed with option ‘-h’: +``` +$ rocprof -h +RPL: on '191018_023018' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package. +Full path: /opt/rocm/rocprofiler/bin/rocprof +Metrics definition: /opt/rocm/rocprofiler/lib/metrics.xml + +Usage: + rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize + # Perf counters group 2 + pmc : WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 + + Input file .xml format, for single profiling run: + + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + + + # Filter by dispatches range, GPU index and kernel names 
+ + + -o - output CSV file [.csv] + -d - directory where profiler store profiling data including traces [/tmp] + The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. + + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] + --ctx-wait - to wait for outstanding contexts on profiler exit [on] + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [off] + + --stats - generating kernel execution stats, file .stats.csv + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. + --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible + Generated files: ._stats.txt .json + Traced API list can be set by input .txt or .xml files. + Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + + + --trace-start - to enable tracing on start [on] + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: + +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:/home/evgeny: + First the configuration file is looking in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. + An example of 'rpl_rc.xml': + +``` +## 6. Publicly available counters and metrics +The following counters are publicly available for commercially available VEGA10/20 GPUs. + +Counters: +``` +• GRBM_COUNT : Tie High - Count Number of Clocks +• GRBM_GUI_ACTIVE : The GUI is Active +• SQ_WAVES : Count number of waves sent to SQs. (per-simd, emulated, global) +• SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) +• SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) +• SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) +• SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) +• SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) +• SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) +• SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) +• SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). 
(per-simd) +• SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) +• TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. +• TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. +• TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. +• TCC_HIT[0-15] : Number of cache hits. +• TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. +• TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. +• TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. +• TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. +• TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) +• TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests +• TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. +``` + +The following derived metrics have been defined and the profiler metrics XML specification can be found at: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/test/tool/metrics.xml. + +Metrics: +``` +• TA_BUSY_avr : TA block is busy. Average over TA instances. +• TA_BUSY_max : TA block is busy. Max over TA instances. +• TA_BUSY_min : TA block is busy. Min over TA instances. +• TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. +• TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. +• TCC_HIT_sum : Number of cache hits. Sum over TCC instances. +• TCC_MISS_sum : Number of cache misses. Sum over TCC instances. +• TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. 
+• TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. +• TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. +• TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. +• TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. +• TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. +• FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• GPUBusy : The percentage of time GPU was busy. +• Wavefronts : Total wavefronts. +• VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). +• SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). +• VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. +• SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). +• VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. +• FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. 
+• LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. +• FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). +• GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). +• VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). +• VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• Mem32Bwrites : +• FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). +• MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). +• MemUnitStalled : The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). +• WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). 
+• ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). +• LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). +``` From 06b07a5e1dd00d1dbbfefa585d1769d8c30f3e28 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Thu, 24 Feb 2022 10:35:21 -0600 Subject: [PATCH 149/153] Create rocprofiler_spec.md --- doc/rocprofiler_spec.md | 837 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 837 insertions(+) create mode 100644 doc/rocprofiler_spec.md diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md new file mode 100644 index 00000000..975d58ca --- /dev/null +++ b/doc/rocprofiler_spec.md @@ -0,0 +1,837 @@ +# ROC Profiler Library Specification +ROC Profiler API version 7 + +## 1. High level overview +``` +The goal of the implementation is to provide a HW specific low-level performance analysis +interface for profiling of GPU compute applications. The profiling includes HW performance +counters with complex performance metrics and HW traces. The implementation distinguishes +two profiling features, metrics and traces. HW performance counters are treated as the basic +metrics and the formulas can be defined for derived complex metrics. +The library can be loaded by HSA runtime as a tool plugin and it can be loaded by higher +level HW independent performance analysis API like PAPI. +The library has C API and is based on AQLprofile AMD specific HSA extension. + + 1. The library provides methods to query the list of supported HW features. + 2. The library provides profiling APIs to start, stop, read metrics results and tracing + data. + 3. The library provides a intercepting API for collecting per-kernel profiling data for + the kernels + dispatched to HSA AQL queues. 
+ 4. The library provides a mechanism to load profiling tool library plugin by env variable + ROCP_TOOL_LIB. + 5. The library is responsible for allocation of the buffers for profiling and notifying + about output data buffer overflow for traces. + 6. The library is implemented based on AMD specific AQLprofile HSA extension. + 7. The library implementation is abstracted from the specific GFXIP. + 8. The library implementation is extensible: + - Easy adding of counters and metrics + - Counters enumeration + - Counters and metrics can be dynamically configured using XML configuration files with + counters and metrics tables: + o Counters table entry, basic metric: counter name, block name, event id + o Complex metrics table entry: metric name, an expression for calculating the metric + from the counters + +Metrics XML file example: + + + + . . . + + + + . . . + + + + + +``` +## 2. Environment +``` +* HSA_TOOLS_LIB - required to be set to the name of rocprofiler library to be loaded by +HSA runtime +* ROCP_METRICS - path to the metrics XML file +* ROCP_TOOL_LIB - path to profiling tool library loaded by ROC Profiler +* ROCP_HSA_INTERCEPT - if set then HSA dispatches intercepting is enabled +``` +## 3. General API +### 3.1. Description +``` +The library supports a method for getting the error number and error string of the last +failed library API call. +To check the conformance of used library API header and the library binary the version +macros and API methods can be used. + +Returning the error and error string methods: +- rocprofiler_error_string - method for returning the error string + +Library version: +- ROCPROFILER_VERSION_MAJOR - API major version macro +- ROCPROFILER_VERSION_MINOR - API minor version macro +- rocprofiler_version_major - library major version +- rocprofiler_version_minor - library minor version +``` +### 3.2. Returning the error and error string methods +``` +const char* rocprofiler_error_string(); +``` +### 3.3.
Library version +``` +The library provides backward compatibility if the library major version is less than or equal +to the API major version macro. + +API version macros defined in the library API header 'rocprofiler.h': + +ROCPROFILER_VERSION_MAJOR +ROCPROFILER_VERSION_MINOR + +Methods to check library major and minor version: + +uint32_t rocprofiler_version_major(); +uint32_t rocprofiler_version_minor(); +``` +## 4. Backend API +### 4.1. Description +``` +The library provides the methods to open/close profiling context, to start, stop and read +HW performance counters and traces, to intercept kernel dispatches to collect per-kernel +profiling data. Also the library provides methods to calculate complex performance metrics +and to query the list of available metrics. The library distinguishes two profiling features, +metrics and traces, where HW performance counters are treated as the basic metrics. To check +if there was an error the library methods return HSA standard status code. +For a given context the profiling can be started/stopped and counters sampled in standalone +mode or profiling can be initiated by intercepting the kernel dispatches with registering +a dispatch callback. +For counters sampling, which is the usage model of higher level APIs like PAPI, +the start/stop/read APIs should be used. +For collecting per-kernel data for the kernels submitted to HSA queues the dispatch callback +API should be used. +The library provides backward compatibility if the library major version is less than or equal to the API major version macro.
+ +Returned API status: +- hsa_status_t - HSA status codes are used from hsa.h header + +Loading and Configuring, loadable plugin on-load/unload methods: +- rocprofiler_settings_t – global properties +- OnLoadTool +- OnLoadToolProp +- OnUnloadTool + +Info API: +- rocprofiler_info_kind_t - profiling info kind +- rocprofiler_info_query_t - profiling info query +- rocprofiler_info_data_t - profiling info data +- rocprofiler_get_info - return the info for a given info kind +- rocprofiler_iterate_info - iterate over the info for a given info kind +- rocprofiler_query_info - iterate over the info for a given info query + +Context API: +- rocprofiler_t - profiling context handle +- rocprofiler_feature_kind_t - profiling feature kind +- rocprofiler_feature_parameter_t - profiling feature parameter +- rocprofiler_data_kind_t - profiling data kind +- rocprofiler_data_t - profiling data +- rocprofiler_feature_t - profiling feature +- rocprofiler_mode_t - profiling modes +- rocprofiler_properties_t - profiler properties +- rocprofiler_open - open new profiling context +- rocprofiler_close - close profiling context and release all allocated resources +- rocprofiler_group_count - return profiling groups count +- rocprofiler_get_group - return profiling group for a given index +- rocprofiler_get_metrics - method for calculating the metrics data +- rocprofiler_iterate_trace_data - method for iterating output trace data instances +- rocprofiler_time_id_t - supported time value ID enumeration +- rocprofiler_get_time – return time for a given time ID and profiling timestamp value + +Sampling API: +- rocprofiler_start - start profiling +- rocprofiler_stop - stop profiling +- rocprofiler_read - read profiling data to the profiling features objects +- rocprofiler_get_data - wait for profiling data + Group versions of start/stop/read/get_data methods: + o rocprofiler_group_start + o rocprofiler_group_stop + o rocprofiler_group_read + o rocprofiler_group_get_data + +Intercepting API: +-
rocprofiler_callback_t - profiling callback type +- rocprofiler_callback_data_t - profiling callback data type +- rocprofiler_dispatch_record_t – dispatch record +- rocprofiler_queue_callbacks_t – queue callbacks, dispatch/destroy +- rocprofiler_set_queue_callbacks - set queue kernel dispatch and queue destroy callbacks +- rocprofiler_remove_queue_callbacks - remove queue callbacks + +Context pool API: +- rocprofiler_pool_t – context pool handle +- rocprofiler_pool_entry_t – context pool entry +- rocprofiler_pool_properties_t – context pool properties +- rocprofiler_pool_handler_t – context pool completion handler +- rocprofiler_pool_open - context pool open +- rocprofiler_pool_close - context pool close +- rocprofiler_pool_fetch – fetch an empty context entry from the pool +- rocprofiler_pool_release – release a context entry +- rocprofiler_pool_iterate – iterate fetched context entries +- rocprofiler_pool_flush – flush completed context entries +``` +### 4.2. Loading and Configuring +``` +Loading and Configuring +The profiling properties can be set by profiler plugin on loading by ROC runtime. +The profiler library plugin can be set by ROCP_TOOL_LIB env var. + +Global properties: + +typedef struct { + uint32_t intercept_mode; + uint64_t timeout; + uint32_t timestamp_on; +} rocprofiler_settings_t; + +On load/unload methods defined in profiling tool library loaded by ROCP_TOOL_LIB env var: +extern "C" void OnLoadTool(); +extern "C" void OnLoadToolProp(rocprofiler_settings_t* settings); +extern "C" void OnUnloadTool(); + +``` +### 4.3. Info API +``` +The profiling metrics are defined by name and the traces are defined by name and parameters. +All supported features can be iterated using 'iterate_info/query_info' methods. The counter +names are defined in counters table configuration file, each counter has a unique name and is +defined by block name and event id.
The traces and trace parameters names are the same as in +the hardware documentation and the parameters codes are rocprofiler_feature_parameter_t values, +see below in the "Context API" section. + +Profiling info kind: + +typedef enum { + ROCPROFILER_INFO_KIND_METRIC = 0, // metric info + ROCPROFILER_INFO_KIND_METRIC_COUNT = 1, // metrics count + ROCPROFILER_INFO_KIND_TRACE = 2, // trace info + ROCPROFILER_INFO_KIND_TRACE_COUNT = 3, // traces count +} rocprofiler_info_kind_t; + +Profiling info data: + +typedef struct { + rocprofiler_info_kind_t kind; // info data kind + union { + struct { + const char* name; // metric name + uint32_t instances; // instances number + const char* expr; // metric expression, NULL for basic counters + const char* description; // metric description + const char* block_name; // block name + uint32_t block_counters; // number of block counters + } metric; + struct { + const char* name; // trace name + const char* description; // trace description + uint32_t parameter_count; // number of parameters supported + // by the trace + } trace; + }; +} rocprofiler_info_data_t; + +Return info for a given info kind: + +hsa_status_t rocprofiler_get_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + void *data); // data passed to callback + +Iterate over the info for a given info kind, and invoke an application-defined callback on +every iteration: + +hsa_status_t rocprofiler_iterate_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); + +Iterate over the info for a given info query, and invoke an application-defined callback on +every iteration.
The query +fields set to NULL define the query wildcard: + +hsa_status_t rocprofiler_query_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + rocprofiler_info_data_t query, // info query + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); // data passed to callback +``` +### 4.4. Context API +``` +Profiling context is accumulating all profiling information including profiling features +which carry profiling data, required buffers for profiling command packets and output data. +The context can be created and deleted by the library open/close methods. By deleting +the context all accumulated by the library resources associated with this context will be +released. If more than one run is required to collect all requested counters data then +data for all profiling groups should be collected and then the metrics can be calculated by +loading the saved groups' data to the profiling context. Saving and loading of the groups +data is the responsibility of the tool. The groups are automatically identified on the profiling +context open and there is an API to access them, see the "Profiling groups" section below.
+ +Profiling context handle: + +typename rocprofiler_t; + +Profiling feature kind: + +typedef enum { + ROCPROFILER_FEATURE_KIND_METRIC = 0, // metric + ROCPROFILER_FEATURE_KIND_TRACE = 1 // trace +} rocprofiler_feature_kind_t; + +Profiling feature parameter: + +typedef hsa_ven_amd_aqlprofile_parameter_t rocprofiler_feature_parameter_t; + +Profiling data kind: + +typedef enum { + ROCPROFILER_DATA_KIND_UNINIT = 0, // data uninitialized + ROCPROFILER_DATA_KIND_INT32 = 1, // 32bit integer + ROCPROFILER_DATA_KIND_INT64 = 2, // 64bit integer + ROCPROFILER_DATA_KIND_FLOAT = 3, // float single-precision result + ROCPROFILER_DATA_KIND_DOUBLE = 4, // float double-precision result + ROCPROFILER_DATA_KIND_BYTES = 5 // trace output as a bytes array +} rocprofiler_data_kind_t; + + +Profiling data: + +typedef struct { + rocprofiler_data_kind_t kind; // result kind + union { + uint32_t result_int32; // 32bit integer result + uint64_t result_int64; // 64bit integer result + float result_float; // float single-precision result + double result_double; // float double-precision result + typedef struct { + void* ptr; // pointer + uint32_t size; // byte size + uint32_t instances; // number of trace instances + } result_bytes; // data by ptr and byte size + }; +} rocprofiler_data_t; + +Profiling feature: + +typedef struct { + rocprofiler_feature_kind_t type; // feature type + const char* name; // feature name + const rocprofiler_feature_parameter_t* parameters; // feature parameters + uint32_t parameter_count; // feature parameter count + rocprofiler_data_t* data; // profiling data +} rocprofiler_feature_t; + +Profiling mode masks: +There are several modes which can be specified for the profiling context. +STANDALONE mode can be used for the counters sampling in another then application context +to support statistical system wide profiling. In this mode the profiling context supports +its own queue which can be created on the context open if the CREATEQUEUE mode also specified. 
+See also "Profiler properties" section below for the standalone mode queue properties. +The profiler supports several profiling groups for collecting profiling data in several +runs and 'SINGLEGROUP' mode allows only one group and the context open will fail if more +groups are needed. + +typedef enum { + ROCPROFILER_MODE_STANDALONE = 1, // standalone mode when ROC profiler + // supports own AQL queue + ROCPROFILER_MODE_CREATEQUEUE = 2, // profiler creates queue in STANDALONE mode + ROCPROFILER_MODE_SINGLEGROUP = 4 // profiler allows one group only and fails + // if more groups are needed +} rocprofiler_mode_t; + +Context data readiness callback: + +typedef void (*rocprofiler_context_callback_t)( + rocprofiler_group_t* group, // profiling group + void* arg); // callback arg + +Profiler properties: +There are several properties which can be specified for the context. A callback can be +registered which will be called when the context data is ready. In standalone profiling mode +'ROCPROFILER_MODE_STANDALONE' the context supports its own queue and the queue can be set by +the property 'queue' or a queue will be created with the specified depth 'queue_depth' if mode +'ROCPROFILER_MODE_CREATEQUEUE' also specified. 
+ +typedef struct { + rocprofiler_context_callback_t callback; // callback on the context data readiness + void* callback_arg; // callback arg + hsa_queue_t* queue; // HSA queue for standalone mode + uint32_t queue_depth; // created queue depth, for create-queue mode +} rocprofiler_properties_t; + +Open/close profiling context: + +hsa_status_t rocprofiler_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in/out] profiling feature array + uint32_t feature_count, // profiling feature count + rocprofiler_t** context, // [out] profiling context handle + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiler properties + +hsa_status_t rocprofiler_close( + rocprofiler_t* context); // [in] profiling context + +Profiling groups: +The profiler on the context open automatically identifies a required number of the application +runs to collect all data needed for all specified metrics and creates a metric group per each +run. Data for all profiling groups should be collected and then the metrics can be calculated +by loading the saved groups' data to the profiling context. Saving and loading of the groups +data is the responsibility of the tool. + +typedef struct { + uint32_t index; // profiling group index + rocprofiler_feature_t** features; // profiling features array + uint32_t feature_count; // profiling feature count + rocprofiler_t* context; // profiling context handle +} rocprofiler_group_t; + +Return profiling groups count: + +hsa_status_t rocprofiler_group_count( + rocprofiler_t* context, // [in/out] profiling context + uint32_t* count); // [out] profiling groups count + +Return the profiling group for a given index: + +hsa_status_t rocprofiler_get_group( + rocprofiler_t* context, // [in/out] profiling context, + // will be returned as + // a part of the group structure + uint32_t index, // [in] group index + rocprofiler_group_t* group); // [out] profiling group + +Calculate metrics data.
The data will be stored to the registered profiling features data fields: +After all profiling context data is ready the registered metrics can be calculated. The context +data readiness can be checked by 'get_data' API or using the context callback. + +hsa_status_t rocprofiler_get_metrics( + rocprofiler_t* context); // [in/out] profiling context + +Method for iterating trace data instances: +Trace data can have several instances, for example, one instance per Shader Engine. + +hsa_status_t rocprofiler_iterate_trace_data( + const rocprofiler_t* context, // [in] context object + hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate + // the output data + void* callback_data); // [in/out] data passed to callback + +Converting a profiling timestamp to a time value for a supported time ID. +Supported time value ID enumeration: +typedef enum { + ROCPROFILER_TIME_ID_CLOCK_REALTIME = 0, // Linux realtime clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 1, // Linux monotonic clock time +} rocprofiler_time_id_t; + +Method for converting a profiling timestamp to a time value for a given time ID: +hsa_status_t rocprofiler_get_time( + rocprofiler_time_id_t time_id, // identifier of the particular + // time to convert the timestamp + uint64_t timestamp, // profiling timestamp + uint64_t* value_ns); // [out] returned time ‘ns’ value +``` +### 4.5. Sampling API +``` +The API supports the counters sampling usage model with start/read/stop methods and also allows +waiting for the profiling data in the intercepting usage model with get_data method.
+ +Start/stop/read methods: + +hsa_status_t rocprofiler_start( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_stop( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_read( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Wait for profiling data: + +hsa_status_t rocprofiler_get_data( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Group versions of the above start/stop/read/get_data methods: + +hsa_status_t rocprofiler_group_start( + rocprofiler_group_t* group); // [in/out] profiling group + +hsa_status_t rocprofiler_group_stop( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_read( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_get_data( + rocprofiler_group_t* group); // [in/out] profiling group +``` +### 4.6. Intercepting API +``` +The library provides a callback API for enabling profiling for the kernels dispatched to +HSA AQL queues. The API enables per-kernel profiling data collection. +Currently implemented the option with serializing the kernels execution. 
+ +ROC profiler callback type: + +typedef hsa_status_t (*rocprofiler_callback_t)( + const rocprofiler_callback_data_t* callback_data, // callback data passed by HSA runtime + void* user_data, // [in/out] user data passed + // to the callback + rocprofiler_group_t* group); // [out] returned profiling group + +Profiling callback data: + +typedef struct { + uint64_t dispatch; // dispatch timestamp + uint64_t begin; // begin timestamp + uint64_t end; // end timestamp + uint64_t complete; // completion signal timestamp +} rocprofiler_dispatch_record_t; + +typedef struct { + hsa_agent_t agent; // GPU agent handle + uint32_t agent_index; // GPU index + const hsa_queue_t* queue; // HSA queue + uint64_t queue_index; // Index in the queue + const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet + const char* kernel_name; // Kernel name + const rocprofiler_dispatch_record_t* record; // Dispatch record +} rocprofiler_callback_data_t; + +Queue callbacks: + +typedef struct { + rocprofiler_callback_t dispatch; // kernel dispatch callback + hsa_status_t (*destroy)(hsa_queue_t* queue, void* data); // queue destroy callback +} rocprofiler_queue_callbacks_t; + +Adding/removing kernel dispatch and queue destroy callbacks: + +hsa_status_t rocprofiler_set_queue_callbacks( + rocprofiler_queue_callbacks_t callbacks, // queue callbacks + void* data); // [in/out] data passed to the callbacks + +hsa_status_t rocprofiler_remove_queue_callbacks(); +``` +### 4.7. Profiling Context Pools +``` +The API provides the capability to create a context pool for a given agent and a set of features, to fetch/release a context entry, and to register a callback for the completion of the pool’s contexts.
+Profiling pool handle: +typename rocprofiler_pool_t; +Profiling pool entry: +typedef struct { + rocprofiler_t* context; // context object + void* payload; // payload data object +} rocprofiler_pool_entry_t; + +Profiling handler, calling on profiling completion: +typedef bool (*rocprofiler_pool_handler_t)(const rocprofiler_pool_entry_t* entry, void* arg); + +Profiling properties: +typedef struct { + uint32_t num_entries; // pool size entries + uint32_t payload_bytes; // payload size bytes + rocprofiler_pool_handler_t handler; // handler on context completion + void* handler_arg; // the handler arg +} rocprofiler_pool_properties_t; + +Open profiling pool: +hsa_status_t rocprofiler_pool_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_pool_t** pool, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_pool_properties_t*); // pool properties + +Close profiling pool: +hsa_status_t rocprofiler_pool_close( + rocprofiler_pool_t* pool); // profiling pool handle + +Fetch profiling pool entry: +hsa_status_t rocprofiler_pool_fetch( + rocprofiler_pool_t* pool, // profiling pool handle + rocprofiler_pool_entry_t* entry); // [out] empty profiling pool entry + +Release profiling pool entry: +hsa_status_t rocprofiler_pool_release( + rocprofiler_pool_entry_t* entry); // released profiling pool entry + +Iterate fetched profiling pool entries: +hsa_status_t rocprofiler_pool_iterate( + rocprofiler_pool_t* pool, // profiling pool handle + hsa_status_t (*callback)(rocprofiler_pool_entry_t* entry, void* data), + // callback + void *data); // [in/out] data passed to callback + +Flush completed entries in profiling pool: +hsa_status_t rocprofiler_pool_flush( + rocprofiler_pool_t* pool); // profiling pool handle +``` +## 5. Application code examples +### 5.1. 
Querying available metrics +``` +Info data callback: + + hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void *data) { + switch (info.kind) { + case ROCPROFILER_INFO_KIND_METRIC: { + if (info.metric.expr != NULL) { + fprintf(stdout, "Derived counter: gpu-agent%d : %s : %s\n", + info.agent_index, info.metric.name, info.metric.description); + fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); + } else { + fprintf(stdout, "Basic counter: gpu-agent%d : %s", + info.agent_index, info.metric.name); + if (info.metric.instances > 1) { + fprintf(stdout, "[0-%u]", info.metric.instances - 1); + } + fprintf(stdout, " : %s\n", info.metric.description); + fprintf(stdout, " block %s has %u counters\n", + info.metric.block_name, info.metric.block_counters); + } + fflush(stdout); + break; + } + default: + printf("wrong info kind %u\n", info.kind); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; + } + +Printing all available metrics: + + hsa_status_t status = rocprofiler_iterate_info( + agent, + ROCPROFILER_INFO_KIND_METRIC, + info_data_callback, + NULL); + +``` +### 5.2. Profiling code example +``` +Profiling of L1 miss ratio, average memory bandwidth. +In the example below rocprofiler_group_get_data group APIs are used for the purpose of a usage +example but in SINGLEGROUP mode when only one group is allowed the context handle itself can be +saved and then direct context method rocprofiler_get_data with default group index equal to 0 +can be used.
+ +hsa_status_t dispatch_callback( + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t* group) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + // Profiling context + rocprofiler_t* context; + // Profiling info objects + rocprofiler_feature_t* features = new rocprofiler_feature_t[2]; + // Tracing parameters + rocprofiler_feature_parameter_t* parameters = new rocprofiler_feature_parameter_t[2]; + + // Setting profiling features + features[0].type = ROCPROFILER_FEATURE_KIND_METRIC; + features[0].name = "L1_MISS_RATIO"; + features[1].type = ROCPROFILER_FEATURE_KIND_METRIC; + features[1].name = "DRAM_BANDWIDTH"; + + // Creating profiling context + status = rocprofiler_open(callback_data->dispatch.agent, features, 2, &context, + ROCPROFILER_MODE_SINGLEGROUP, NULL); + + + // Get the profiling group + // For general case with many groups there is rocprofiler_group_count() API + const uint32_t group_index = 0; + status = rocprofiler_get_group(context, group_index, group); + + + // In SINGLEGROUP mode the context handle itself can be saved, because there is just one group + + + return status; +} + +Profiling tool constructor is adding the dispatch callback: + +void profiling_library_constructor() { + // Defining callback data, no data in this simple example + void* callback_data = NULL; + + // Adding observers + hsa_status_t status = rocprofiler_add_dispatch_callback(dispatch_callback, callback_data); + + + // Dispatching profiled kernel + +} + +void profiling_library_destructor() { + > { + // In SINGLEGROUP mode the rocprofiler_get_group() method with default zero group + // index can be used, if context handle would be saved + status = rocprofiler_group_get_data(entry->group); + + status = rocprofiler_get_metrics(entry->group->context); + + status = rocprofiler_close(entry->group->context); + + + dispatch_data, entry->features, entry->features_count)>; + } +} +``` +### 5.3.
Option to use completion callback +``` +Creating profiling context with completion callback: + . . . + rocprofiler_properties_t properties = {}; + properties.callback = completion_callback; + properties.callback_arg = NULL; // no args defined + status = rocprofiler_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . + +Definition of completion callback: + +void completion_callback(profiler_group_t group, void* arg) { + + hsa_status_t status = rocprofiler_close(group.context); + +} +``` +### 5.4. Option to Use Context Pool +``` +Code example of context pool usage. +Creating profiling contexts pool: + . . . + rocprofiler_pool_properties_t properties{}; + properties.num_entries = 100; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_handler; + properties.handler_arg = handler_arg; + status = rocprofiler_pool_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . + +Fetching a context entry: + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast + (pool_entry.payload); +``` +### 5.5. Standalone Sampling Usage Code Example +``` +The profiling metrics are being read from separate standalone queue other than the application kernels are submitted to. +To enable the sampling mode, the profiling mode in all user queues should be enabled. It can be done by loading ROC-profiler +library to HSA runtime using the environment variable HSA_TOOLS_LIB for all shell sessions. 
+ // Sampling rate + uint32_t sampling_rate = ; + // Sampling count + uint32_t sampling_count = ; + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + // HSA agent + hsa_agent_t agent; + // Profiling context + rocprofiler_t* context = NULL; + // Profiling properties + rocprofiler_properties_t properties; + + // Getting HSA agent + + + // Profiling feature objects + const unsigned feature_count = 2; + rocprofiler_feature_t feature[feature_count]; + + // Counters and metrics + feature[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[0].name = "GPUBusy"; + feature[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[1].name = "SQ_WAVES"; + + // Creating profiling context with standalone queue + properties = {}; + properties.queue_depth = 128; + status = rocprofiler_open(agent, feature, feature_count, &context, + ROCPROFILER_MODE_STANDALONE| ROCPROFILER_MODE_CREATEQUEUE| + ROCPROFILER_MODE_SINGLEGROUP, &properties); + + + // Start counters and sample them in the loop with the sampling rate + status = rocprofiler_start(context, 0); + + + for (unsigned ind = 0; ind < sampling_count; ++ind) { + sleep(sampling_rate); + status = rocprofiler_read(context, 0); + + status = rocprofiler_get_data(context, 0); + + status = rocprofiler_get_metrics(context); + + print_results(feature, feature_count); + } + + // Stop counters + status = rocprofiler_stop(context, 0); + + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + status = rocprofiler_close(context); + +``` +### 5.6.
Printing Out Profiling Results +``` +Below is a code example for printing out the profiling results from profiling features array: +void print_results(rocprofiler_feature_t* feature, uint32_t feature_count) { + for (rocprofiler_feature_t* p = feature; p < feature + feature_count; ++p) + { + std::cout << (p - feature) << ": " << p->name; + switch (p->data.kind) { + case ROCPROFILER_DATA_KIND_INT64: + std::cout << " result_int64 (" << p->data.result_int64 << ")" + << std::endl; + break; + + case ROCPROFILER_DATA_KIND_BYTES: { + std::cout << " result_bytes ptr(" << p->data.result_bytes.ptr << + ") " << " size(" << p->data.result_bytes.size << ")" + << " instance_count(" << p->data.result_bytes.instance_count + << ")"; + break; + } + default: + std::cout << "bad result kind (" << p->data.kind << ")" + << std::endl; + + } + } +} +``` From 4650aa69b8bcc22ff4ea0206968e89b5c9f3ddb9 Mon Sep 17 00:00:00 2001 From: Chun Yang Date: Thu, 17 Mar 2022 00:12:12 -0700 Subject: [PATCH 150/153] SWDEV-324379 : Expose FP64 and FP32 performance counters on on AMD profilers for MI200 Change-Id: I2c38ccc297872dfc1896314ceadbed98dc761766 (cherry picked from commit 26c479c72a585e16b9cb34f8d4dd8a9cc2bad8a9) --- test/tool/gfx_metrics.xml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index c2a79af2..8da94414 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -103,6 +103,26 @@ + + + + + + + + + + + + + + + + + + + + From e6b8a3baf2db2ff9f0bdef21cff45056fcf23641 Mon Sep 17 00:00:00 2001 From: Saurabh Verma Date: Mon, 16 May 2022 15:40:44 -0500 Subject: [PATCH 151/153] SWDEV-298750:Approval to make internal profile counters public Added approved HW counters for MI200. 
Also added derived metrics for the same Change-Id: I1c6abfdfde4e4fd4ba8bd5eec0557ad08fd71c77 --- test/tool/gfx_metrics.xml | 235 +++++++++++++++++++++++++++++++++++++- test/tool/metrics.xml | 118 ++++++++++++++++++- 2 files changed, 348 insertions(+), 5 deletions(-) diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index 8da94414..9380eb4f 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -102,7 +102,70 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -118,23 +181,187 @@ + + + + + + + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index a920ff04..2f8e10dd 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -65,7 +65,123 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 4fccecb090f1205440b3a12d71acbfdc9a70c7fb Mon Sep 17 00:00:00 2001 From: Kiumars Sabeti Date: Fri, 15 Apr 2022 04:38:04 +0000 Subject: [PATCH 152/153] SWDEV-320429: wrapping the comma-containing names in the .csv in double quotes at the time the .csv is generated Change-Id: I62f94a1cf8895eb324080f8aacac3f13c02d7050 (cherry picked from commit 4d99f8d8e56fc5d615ba4eea439e32f0dd8466dc) --- bin/sqlitedb.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 7aadd257..d1584e54 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -129,6 +129,11 @@ def dump_csv(self, table_name, file_name): with 
open(file_name, mode='w') as fd: fd.write(','.join(fields) + '\n') for raw in self._get_raws(table_name): + tmp = list(raw) + for idx in range(len(tmp)): + if type(tmp[idx]) == str: + if(not(tmp[idx][0] == tmp[idx][-1] == '"')): tmp[idx] = '"'+tmp[idx]+'"' + raw = tuple(tmp) fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') # dump JSON trace From 39956a963cfe2f17503790db21088ed03e4369da Mon Sep 17 00:00:00 2001 From: "DeWitt, Stephen" Date: Wed, 14 Sep 2022 09:57:05 -0400 Subject: [PATCH 153/153] typo fix in the help output --- bin/rpl_run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index ef5a0bda..84a56627 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -186,7 +186,7 @@ usage() { echo " Supported time formats: " echo " --flush-rate - to enable trace flush rate (time period)" echo " Supported time formats: " - echo " --parallel-kernels - to enable cnocurrent kernels" + echo " --parallel-kernels - to enable concurrent kernels" echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:"