Skip to content

Commit

Permalink
[ROCm] Update ck to use ck_tile (#21030)
Browse files Browse the repository at this point in the history
  • Loading branch information
cloudhan authored Jun 19, 2024
1 parent 5a0e523 commit ddd4ce3
Show file tree
Hide file tree
Showing 16 changed files with 224 additions and 41 deletions.
2 changes: 1 addition & 1 deletion cgmanifests/generated/cgmanifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@
"component": {
"type": "git",
"git": {
"commitHash": "5356c4a943a35e74d7cdc69486afcb8703b9a59a",
"commitHash": "204da9c522cebec5220bba52cd3542ebcaf99e7a",
"repositoryUrl": "https://github.com/ROCmSoftwarePlatform/composable_kernel.git"
},
"comments": "composable_kernel"
Expand Down
6 changes: 6 additions & 0 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ option(onnxruntime_ENABLE_TRITON "Enable Triton" OFF)

# composable kernel is managed automatically, unless user want to explicitly disable it, it should not be manually set
option(onnxruntime_USE_COMPOSABLE_KERNEL "Enable composable kernel for ROCm EP" ON)
option(onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE "Enable ck_tile for composable kernel" ON)
option(onnxruntime_USE_ROCBLAS_EXTENSION_API "Enable rocblas tuning for ROCm EP" OFF)
option(onnxruntime_USE_TRITON_KERNEL "Enable triton compiled kernel" OFF)
option(onnxruntime_BUILD_KERNEL_EXPLORER "Build Kernel Explorer for testing and profiling GPU kernels" OFF)
Expand Down Expand Up @@ -367,6 +368,11 @@ if (onnxruntime_USE_ROCM)
if (onnxruntime_USE_COMPOSABLE_KERNEL AND ROCM_VERSION_DEV VERSION_LESS "5.3")
message(WARNING "composable kernel is only supported on ROCm >= 5.3")
set(onnxruntime_USE_COMPOSABLE_KERNEL OFF)
set(onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE OFF)
endif()
if (onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE AND ROCM_VERSION_DEV VERSION_LESS "6.0")
message(WARNING "ck_tile can only be enabled on ROCm >= 6.0 due to compatibility and compilation speed, disable automatically")
set(onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE OFF)
endif()
endif()

Expand Down
2 changes: 1 addition & 1 deletion cmake/deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,5 @@ tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2
cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.5.0.zip;ae038931b9fc2c416c17d9cda91d9706b343f56d
utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156
extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c
composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/5356c4a943a35e74d7cdc69486afcb8703b9a59a.zip;522382c2af437e09124287e5879ab64af5b2e299
composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
38 changes: 38 additions & 0 deletions cmake/external/composable_kernel.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ FetchContent_Declare(composable_kernel
FetchContent_GetProperties(composable_kernel)
if(NOT composable_kernel_POPULATED)
FetchContent_Populate(composable_kernel)
set(GPU_TARGETS ${CMAKE_HIP_ARCHITECTURES})
set(BUILD_DEV OFF CACHE BOOL "Disable -Weverything, otherwise, error: 'constexpr' specifier is incompatible with C++98 [-Werror,-Wc++98-compat]" FORCE)
# Exclude i8 device gemm instances due to excessive long compilation time and not being used
set(DTYPES fp32 fp16 bf16 fp8)
Expand All @@ -22,4 +23,41 @@ if(NOT composable_kernel_POPULATED)
${composable_kernel_BINARY_DIR}/include
${composable_kernel_SOURCE_DIR}/library/include)
target_compile_definitions(onnxruntime_composable_kernel_includes INTERFACE __fp32__ __fp16__ __bf16__)

execute_process(
COMMAND ${Python3_EXECUTABLE} ${composable_kernel_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
--list_blobs ${composable_kernel_BINARY_DIR}/blob_list.txt
COMMAND_ERROR_IS_FATAL ANY
)
file(STRINGS ${composable_kernel_BINARY_DIR}/blob_list.txt generated_fmha_srcs)
add_custom_command(
OUTPUT ${generated_fmha_srcs}
COMMAND ${Python3_EXECUTABLE} ${composable_kernel_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py --output_dir ${composable_kernel_BINARY_DIR}
DEPENDS ${composable_kernel_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py ${composable_kernel_BINARY_DIR}/blob_list.txt
)
set_source_files_properties(${generated_fmha_srcs} PROPERTIES LANGUAGE HIP GENERATED TRUE)
add_custom_target(gen_fmha_srcs DEPENDS ${generated_fmha_srcs}) # dummy target for dependencies
# code generation complete

set(fmha_srcs
${generated_fmha_srcs}
${composable_kernel_SOURCE_DIR}/example/ck_tile/01_fmha/fmha_fwd.cpp
${composable_kernel_SOURCE_DIR}/example/ck_tile/01_fmha/fmha_fwd.hpp
${composable_kernel_SOURCE_DIR}/example/ck_tile/01_fmha/bias.hpp
${composable_kernel_SOURCE_DIR}/example/ck_tile/01_fmha/mask.hpp
)
add_library(onnxruntime_composable_kernel_fmha STATIC EXCLUDE_FROM_ALL ${generated_fmha_srcs})
target_link_libraries(onnxruntime_composable_kernel_fmha PUBLIC onnxruntime_composable_kernel_includes)
target_include_directories(onnxruntime_composable_kernel_fmha PUBLIC ${composable_kernel_SOURCE_DIR}/example/ck_tile/01_fmha)
add_dependencies(onnxruntime_composable_kernel_fmha gen_fmha_srcs)

# ck tile only supports MI200+ GPUs at the moment
get_target_property(archs onnxruntime_composable_kernel_fmha HIP_ARCHITECTURES)
string(REPLACE "," ";" archs "${archs}")
set(original_archs ${archs})
list(FILTER archs INCLUDE REGEX "(gfx942|gfx90a)")
if (NOT original_archs EQUAL archs)
message(WARNING "ck tile only supports archs: ${archs} among the originally specified ${original_archs}")
endif()
set_target_properties(onnxruntime_composable_kernel_fmha PROPERTIES HIP_ARCHITECTURES "${archs}")
endif()
3 changes: 3 additions & 0 deletions cmake/onnxruntime_kernel_explorer.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ elseif (onnxruntime_USE_ROCM)
target_compile_definitions(kernel_explorer PRIVATE __HIP_PLATFORM_AMD__=1 __HIP_PLATFORM_HCC__=1)
if (onnxruntime_USE_COMPOSABLE_KERNEL)
target_compile_definitions(kernel_explorer PRIVATE USE_COMPOSABLE_KERNEL)
if (onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE)
target_compile_definitions(kernel_explorer PRIVATE USE_COMPOSABLE_KERNEL_CK_TILE)
endif()
target_link_libraries(kernel_explorer PRIVATE onnxruntime_composable_kernel_includes)
endif()
if (onnxruntime_USE_TRITON_KERNEL)
Expand Down
4 changes: 4 additions & 0 deletions cmake/onnxruntime_providers_rocm.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@
device_softmax_instance
)
target_compile_definitions(onnxruntime_providers_rocm PRIVATE USE_COMPOSABLE_KERNEL)
if (onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE)
target_link_libraries(onnxruntime_providers_rocm PUBLIC onnxruntime_composable_kernel_fmha)
target_compile_definitions(onnxruntime_providers_rocm PRIVATE USE_COMPOSABLE_KERNEL_CK_TILE)
endif()
endif()

if(UNIX)
Expand Down
3 changes: 3 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,9 @@ endif()
if (onnxruntime_USE_ROCM)
if (onnxruntime_USE_COMPOSABLE_KERNEL)
target_compile_definitions(onnxruntime_test_all PRIVATE USE_COMPOSABLE_KERNEL)
if (onnxruntime_USE_COMPOSABLE_KERNEL_CK_TILE)
target_compile_definitions(onnxruntime_test_all PRIVATE USE_COMPOSABLE_KERNEL_CK_TILE)
endif()
endif()
target_compile_options(onnxruntime_test_all PRIVATE -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
target_include_directories(onnxruntime_test_all PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
Expand Down
148 changes: 128 additions & 20 deletions cmake/patches/composable_kernel/Fix_Clang_Build.patch
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04674124c..12e8b8b00 100644
index c23746e7f..bc326c8b5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ endif()
@@ -23,10 +23,10 @@ endif()

set(version 1.1.0)
# Check support for CUDA/HIP in Cmake
-project(composable_kernel VERSION ${version})
-project(composable_kernel VERSION ${version} LANGUAGES CXX)
+project(composable_kernel VERSION ${version} LANGUAGES CXX HIP)
include(CTest)

-find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED)
+find_package(Python3 COMPONENTS Interpreter REQUIRED)

list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

@@ -173,27 +173,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -227,27 +227,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")

Expand Down Expand Up @@ -39,27 +43,20 @@ index 04674124c..12e8b8b00 100644
## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary.
@@ -215,8 +194,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
@@ -269,12 +248,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH )
message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}")
endif()
message(STATUS "Build with HIP ${HIP_VERSION}")
-link_libraries(hip::device)
-add_compile_definitions(__HIP_PLATFORM_HCC__=1)
-if(CK_hip_VERSION VERSION_GREATER_EQUAL 6.0.23494)
- add_compile_definitions(__HIP_PLATFORM_AMD__=1)
-else()
- add_compile_definitions(__HIP_PLATFORM_HCC__=1)
-endif()

## tidy
include(EnableCompilerWarnings)
@@ -376,7 +353,9 @@ if(BUILD_DEV)
add_compile_options(-Werror -Weverything)
endif()
#add flags to reduce the size of binaries
-add_compile_options(-Oz -flto=thin)
+# -flto requires ORT to use a linker that support LTO and -flto flag shoud be passed to linker together.
+# add_compile_options(-Oz -flto=thin)
+add_compile_options(-Oz)
message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")

add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})
@@ -482,11 +461,3 @@ rocm_install(FILES
@@ -541,11 +514,3 @@ rocm_install(FILES

set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
Expand All @@ -71,11 +68,122 @@ index 04674124c..12e8b8b00 100644
- LDCONFIG
- HEADER_ONLY
-)
diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py
index 51fecd07b..5ed371995 100644
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -566,7 +566,7 @@ def write_blobs(output_dir : Optional[str], kernel_filter : Optional[str], recei
def list_blobs(output_file : Optional[str], kernel_filter : Optional[str], receipt, mask_impl) -> None:
assert output_file is not None
file_path = Path(output_file)
- with file_path.open('a') as f:
+ with file_path.open('w') as f:
_, kernels = get_blobs(kernel_filter, receipt, mask_impl)
for kernel in kernels:
f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
diff --git a/include/ck/host_utility/hip_check_error.hpp b/include/ck/host_utility/hip_check_error.hpp
index c0894f1d7..559481fee 100644
--- a/include/ck/host_utility/hip_check_error.hpp
+++ b/include/ck/host_utility/hip_check_error.hpp
@@ -6,19 +6,7 @@
#include <sstream>
#include <hip/hip_runtime.h>

-// To be removed, which really does not tell the location of failed HIP functional call
-inline void hip_check_error(hipError_t x)
-{
- if(x != hipSuccess)
- {
- std::ostringstream ss;
- ss << "HIP runtime error: " << hipGetErrorString(x) << ". "
- << "hip_check_error.hpp"
- << ": " << __LINE__ << "in function: " << __func__;
- throw std::runtime_error(ss.str());
- }
-}
-
+#ifndef HIP_CHECK_ERROR
#define HIP_CHECK_ERROR(retval_or_funcall) \
do \
{ \
@@ -32,3 +20,9 @@ inline void hip_check_error(hipError_t x)
throw std::runtime_error(ostr.str()); \
} \
} while(0)
+#endif
+
+#ifndef hip_check_error
+#define hip_check_error HIP_CHECK_ERROR
+#endif
+
diff --git a/include/ck_tile/core/utility/transpose_vectors.hpp b/include/ck_tile/core/utility/transpose_vectors.hpp
index a164c3f94..293ead89a 100644
--- a/include/ck_tile/core/utility/transpose_vectors.hpp
+++ b/include/ck_tile/core/utility/transpose_vectors.hpp
@@ -11,6 +11,9 @@

namespace ck_tile {

+template <typename... Ts>
+constexpr bool always_false = false;
+
// S: scalar type (or it can be non-scalar type)
// NX: # of vector before transpose
// NY: # of vector after transpose
@@ -117,9 +120,11 @@ struct transpose_vectors
}
else
{
- static_assert(false, "not implemented");
+ static_assert(always_false<S_, number<NX>, number<NY>>, "not implemented");
}
}
};

+
} // namespace ck_tile
+
diff --git a/include/ck_tile/host/hip_check_error.hpp b/include/ck_tile/host/hip_check_error.hpp
index 3acdb4d87..cc26e184f 100644
--- a/include/ck_tile/host/hip_check_error.hpp
+++ b/include/ck_tile/host/hip_check_error.hpp
@@ -8,20 +8,7 @@
#include <stdexcept>
#include <hip/hip_runtime.h>

-namespace ck_tile {
-// To be removed, which really does not tell the location of failed HIP functional call
-CK_TILE_HOST void hip_check_error(hipError_t x)
-{
- if(x != hipSuccess)
- {
- std::ostringstream ss;
- ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__
- << "in function: " << __func__;
- throw std::runtime_error(ss.str());
- }
-}
-} // namespace ck_tile
-
+#ifndef HIP_CHECK_ERROR
#define HIP_CHECK_ERROR(retval_or_funcall) \
do \
{ \
@@ -34,3 +21,9 @@ CK_TILE_HOST void hip_check_error(hipError_t x)
throw std::runtime_error(ostr.str()); \
} \
} while(0)
+#endif
+
+#ifndef hip_check_error
+#define hip_check_error HIP_CHECK_ERROR
+#endif
+
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
index 9cb5d0e9a..141a46f3d 100644
index c035e7e56..8c5f36d2e 100644
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -44,8 +44,14 @@ function(add_instance_library INSTANCE_NAME)
@@ -59,8 +59,14 @@ function(add_instance_library INSTANCE_NAME)
endforeach()
#only continue if there are some source files left on the list
if(ARGN)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ std::string RocmTuningResultsValidator::GetOrtBuildConfig() const {
std::ostringstream oss;
#ifdef USE_COMPOSABLE_KERNEL
oss << "USE_CK=" << 1 << "|";
#ifdef USE_COMPOSABLE_KERNEL_CK_TILE
oss << "USE_CKTILE=" << 1 << "|";
#endif
#else
oss << "USE_CK=" << 0 << "|";
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,11 @@ stages:
- job: Linux_C_API_Packaging_ROCm_x64
workspace:
clean: all
timeoutInMinutes: 120
timeoutInMinutes: 240
pool: onnxruntime-Ubuntu2204-AMD-CPU
variables:
RocmVersion: '5.6'
RocmVersionPatchSuffix: ''
steps:
- checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime
submodules: recursive
Expand All @@ -194,7 +195,7 @@ stages:
--build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur
--build-arg BUILD_UID=$(id -u)
--network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64
--build-arg ROCM_VERSION=$(RocmVersion)
--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)
--build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root
--build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin:
--build-arg LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib
Expand Down Expand Up @@ -681,13 +682,13 @@ stages:
inputs:
targetType: 'inline'
script: |
$x64_nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/nuget-artifact-x64 -Filter Microsoft.ML.OnnxRuntime.QNN*.nupkg -Recurse)
$nuget_package_name = $x64_nupkgs[0].Name
$x64_nuget_package = $x64_nupkgs[0].FullName
$nupkg_unzipped_directory = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'nuget_unzip_merged', [System.IO.Path]::GetFileNameWithoutExtension($nuget_package_name))
$x64_unzip_cmd = "7z.exe x $x64_nuget_package -y -o$nupkg_unzipped_directory"
Invoke-Expression -Command $x64_unzip_cmd
Expand All @@ -703,9 +704,9 @@ stages:
}
$merged_zip = [System.IO.Path]::Combine($merged_nuget_path, 'qnn_nuget.zip')
$zip_cmd = "7z.exe a -r $merged_zip $nupkg_unzipped_directory/*"
$zip_cmd = "7z.exe a -r $merged_zip $nupkg_unzipped_directory/*"
Invoke-Expression -Command $zip_cmd
$merged_nuget = [System.IO.Path]::Combine($merged_nuget_path, $nuget_package_name)
move $merged_zip $merged_nuget
workingDirectory: $(Build.BinariesDirectory)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ variables:
value: 109
- name: RocmVersion
value: 6.0
- name: RocmVersionPatchSuffix
value: ".3"

jobs:
- job: Linux_Build
Expand All @@ -47,7 +49,7 @@ jobs:
workspace:
clean: all
pool: onnxruntime-Ubuntu2204-AMD-CPU
timeoutInMinutes: 120
timeoutInMinutes: 240

steps:
- task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3
Expand All @@ -63,7 +65,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)

- task: Cache@2
Expand Down Expand Up @@ -162,7 +164,7 @@ jobs:
parameters:
Dockerfile: tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile
Context: tools/ci_build/github/linux/docker
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)"
DockerBuildArgs: "--build-arg ROCM_VERSION=$(RocmVersion)$(RocmVersionPatchSuffix)"
Repository: onnxruntimetrainingmigraphx-cibuild-rocm$(RocmVersion)

- task: CmdLine@2
Expand Down
Loading

0 comments on commit ddd4ce3

Please sign in to comment.