[Enhancement] xdlops NCHW support by transpose (#1247)

* implement set/get attribute API, and add MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL to control MIOPEN_DEBUG_FP16_ALT_IMP attribute * get attribute in asm igemm nhwc solver, and conditionally set symbol based on attribute MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL value * gfx90a_fp16_alt_impl(01) Add constness to the API. Allow resetting the attribute. Some error handling. Comments. * gfx90a_fp16_alt_impl(02) WrW: Pass ALT attribute via InvokeParams. ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC: Update Solver and Invokers. * gfx90a_fp16_alt_impl(03) Accelerate access to attribute. Error handling. MIOPEN_DEBUG_FP16_ALT_IMP -> MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL. * gfx90a_fp16_alt_impl(10) [clang-tidy] Disable altera-unroll-loops (ROCm 4.5). Sort list of disabled warnings. * fix ostringstream constructor with string problem, by adding eta to 2nd arg * add batched transpose gpu kernel aim to serve nchw<->nhwc convert Co-authored-by: Artem Tamazov <[email protected]>
ROCm · Nov 4, 2021 · f02d959 · f02d959
1 parent c0636c7
commit f02d959
Show file tree

Hide file tree

Showing 16 changed files with 4,211 additions and 197 deletions.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -186,6 +186,9 @@ set( MIOpen_Source
     include/miopen/reduce_common.hpp
     include/miopen/sequences.hpp
     include/miopen/rocm_features.hpp
+    include/miopen/batched_transpose_sol.hpp
+    include/miopen/magic_div.hpp
+    include/miopen/util_sol.hpp
     md_graph.cpp
     mdg_expr.cpp
     conv/invokers/gcn_asm_1x1u.cpp
@@ -279,6 +282,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
     file(GLOB_RECURSE COMPOSABLE_KERNEL_DYNAMIC_CPP_SOURCE "kernels/dynamic_igemm/*.cpp")
     file(GLOB_RECURSE GPU_REFERENCE_KERNEL_HIP "kernels/gpu_reference_kernel/*.cpp")
     file(GLOB_RECURSE GPU_REFERENCE_KERNEL_ASM "kernels/gpu_reference_kernel/*.s")
+    file(GLOB_RECURSE GPU_BATCHED_TRANSPOSE_KERNEL_HIP "kernels/gpu_batched_transpose_kernel/*.cpp")
 
     set(MIOPEN_KERNEL_INCLUDES
         ${STATIC_COMPOSABLE_KERNEL_INCLUDE}
@@ -379,6 +383,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         ${COMPOSABLE_KERNEL_DYNAMIC_CPP_SOURCE}
         ${GPU_REFERENCE_KERNEL_HIP}
         ${GPU_REFERENCE_KERNEL_ASM}
+        ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP}
         kernels/detect_llvm_amdgcn_buffer_atomic_fadd_f32_float.cpp
         kernels/MIOpenCheckNumerics.cl
         kernels/MIOpenBatchNormActivBwdPerAct.cl
@@ -503,6 +508,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN
         ocl/gcn_asm_utils.cpp
         ocl/rnn_util_ocl.cpp
         hip/hip_build_utils.cpp
+        hip/batched_transpose_sol.cpp
         pooling.cpp
         ocl/fusionopconvocl.cpp
         ocl/fusionopbiasbnactivocl.cpp

diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp