From 8d205b4458bbc5bd040f52764ed25194e23798ff Mon Sep 17 00:00:00 2001 From: JD Date: Mon, 11 Apr 2022 01:03:26 -0500 Subject: [PATCH 1/7] Update MIOpen Version to 2.17 (#1511) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f4d2abd648..18a059ace9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,7 +86,7 @@ if(NOT WIN32 AND NOT APPLE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() -rocm_setup_version(VERSION 2.16.0) +rocm_setup_version(VERSION 2.17.0) list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) include(TargetFlags) From 1aa63be0c5ae053f24e0ba1efa1fb720e8e00f30 Mon Sep 17 00:00:00 2001 From: Murali N <33875998+muralinr@users.noreply.github.com> Date: Sun, 10 Apr 2022 23:05:17 -0700 Subject: [PATCH 2/7] [RNN][Op2dTensorSquash] Tensor squash kernel optimizations (#1491) --- src/kernels/MIOpenTensorKernels.cl | 139 +++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 8 deletions(-) diff --git a/src/kernels/MIOpenTensorKernels.cl b/src/kernels/MIOpenTensorKernels.cl index f95b65a21f..f684ce0e82 100644 --- a/src/kernels/MIOpenTensorKernels.cl +++ b/src/kernels/MIOpenTensorKernels.cl @@ -635,12 +635,43 @@ __kernel void Op2dTensorSquash(const global MIOPEN_TYPE* a, const int use_bet) { MIOPEN_TYPE a_dat[RD_BLCK]; - MIOPEN_TYPE b_dat[RD_BLCK]; + MIOPEN_TYPE b_dat1[RD_BLCK]; + MIOPEN_TYPE b_dat2[RD_BLCK]; + MIOPEN_TYPE b_dat3[RD_BLCK]; + MIOPEN_TYPE b_dat4[RD_BLCK]; + MIOPEN_TYPE b_dat5[RD_BLCK]; + MIOPEN_TYPE b_dat6[RD_BLCK]; + MIOPEN_TYPE b_dat7[RD_BLCK]; + MIOPEN_TYPE b_dat8[RD_BLCK]; + MIOPEN_TYPE b_dat9[RD_BLCK]; + MIOPEN_TYPE b_dat10[RD_BLCK]; + MIOPEN_TYPE b_dat11[RD_BLCK]; + MIOPEN_TYPE b_dat12[RD_BLCK]; + MIOPEN_TYPE b_dat13[RD_BLCK]; + MIOPEN_TYPE b_dat14[RD_BLCK]; + MIOPEN_TYPE b_dat15[RD_BLCK]; + MIOPEN_TYPE b_dat16[RD_BLCK]; MIOPEN_TYPE c_dat[RD_BLCK]; + int g_RD_BLCK; for(int i = 0; i < RD_BLCK; ++i) { - b_dat[i] = (MIOPEN_TYPE)0; + b_dat1[i] = (MIOPEN_TYPE)0; + b_dat2[i] = (MIOPEN_TYPE)0; + b_dat3[i] = (MIOPEN_TYPE)0; + b_dat4[i] = (MIOPEN_TYPE)0; + b_dat5[i] = (MIOPEN_TYPE)0; + b_dat6[i] = (MIOPEN_TYPE)0; + b_dat7[i] = (MIOPEN_TYPE)0; + b_dat8[i] = (MIOPEN_TYPE)0; + b_dat9[i] = (MIOPEN_TYPE)0; + b_dat10[i] = (MIOPEN_TYPE)0; + b_dat11[i] = (MIOPEN_TYPE)0; + b_dat12[i] = (MIOPEN_TYPE)0; + b_dat13[i] = (MIOPEN_TYPE)0; + b_dat14[i] = (MIOPEN_TYPE)0; + b_dat15[i] = (MIOPEN_TYPE)0; + b_dat16[i] = (MIOPEN_TYPE)0; } for(int gid = get_global_id(0); gid < total_work; gid += get_global_size(0)) @@ -670,16 +701,108 @@ __kernel void Op2dTensorSquash(const global MIOPEN_TYPE* a, } } - for(int bid = 0; bid < b_c; bid++) + g_RD_BLCK = gid * RD_BLCK; + if(use_apl1 == 1) { - if(use_apl1 == 1) + for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16) { - int b_index = bid * b_nstride + gid * RD_BLCK; - *((READ_TYPE*)b_dat) = *((const global READ_TYPE*)(b + Boffset + b_index)); + + int b_index1 = (bid * b_nstride) + g_RD_BLCK; + int b_index2 = ((bid + 1) * b_nstride) + g_RD_BLCK; + int b_index3 = ((bid + 2) * b_nstride) + g_RD_BLCK; + int b_index4 = ((bid + 3) * b_nstride) + g_RD_BLCK; + int b_index5 = ((bid + 4) * b_nstride) + g_RD_BLCK; + int b_index6 = ((bid + 5) * b_nstride) + g_RD_BLCK; + int b_index7 = ((bid + 6) * b_nstride) + g_RD_BLCK; + int b_index8 = ((bid + 7) * b_nstride) + g_RD_BLCK; + int b_index9 = ((bid + 8) * b_nstride) + g_RD_BLCK; + int b_index10 = ((bid + 9) * b_nstride) + g_RD_BLCK; + int b_index11 = ((bid + 10) * b_nstride) + g_RD_BLCK; + int b_index12 = ((bid + 11) * b_nstride) + g_RD_BLCK; + int b_index13 = ((bid + 12) * b_nstride) + g_RD_BLCK; + int b_index14 = ((bid + 13) * b_nstride) + g_RD_BLCK; + int b_index15 = ((bid + 14) * b_nstride) + g_RD_BLCK; + int b_index16 = ((bid + 15) * b_nstride) + g_RD_BLCK; + *((READ_TYPE*)b_dat1) = *((const global READ_TYPE*)(b + Boffset + b_index1)); + *((READ_TYPE*)b_dat2) = *((const global READ_TYPE*)(b + Boffset + b_index2)); + *((READ_TYPE*)b_dat3) = *((const global READ_TYPE*)(b + Boffset + b_index3)); + *((READ_TYPE*)b_dat4) = *((const global READ_TYPE*)(b + Boffset + b_index4)); + *((READ_TYPE*)b_dat5) = *((const global READ_TYPE*)(b + Boffset + b_index5)); + *((READ_TYPE*)b_dat6) = *((const global READ_TYPE*)(b + Boffset + b_index6)); + *((READ_TYPE*)b_dat7) = *((const global READ_TYPE*)(b + Boffset + b_index7)); + *((READ_TYPE*)b_dat8) = *((const global READ_TYPE*)(b + Boffset + b_index8)); + *((READ_TYPE*)b_dat9) = *((const global READ_TYPE*)(b + Boffset + b_index9)); + *((READ_TYPE*)b_dat10) = *((const global READ_TYPE*)(b + Boffset + b_index10)); + *((READ_TYPE*)b_dat11) = *((const global READ_TYPE*)(b + Boffset + b_index11)); + *((READ_TYPE*)b_dat12) = *((const global READ_TYPE*)(b + Boffset + b_index12)); + *((READ_TYPE*)b_dat13) = *((const global READ_TYPE*)(b + Boffset + b_index13)); + *((READ_TYPE*)b_dat14) = *((const global READ_TYPE*)(b + Boffset + b_index14)); + *((READ_TYPE*)b_dat15) = *((const global READ_TYPE*)(b + Boffset + b_index15)); + *((READ_TYPE*)b_dat16) = *((const global READ_TYPE*)(b + Boffset + b_index16)); + + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat2[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat3[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat4[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat5[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat6[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat7[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat8[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat9[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat10[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat11[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat12[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat13[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat14[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat15[i] * alpha1); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat16[i] * alpha1); + } } - for(int i = 0; i < RD_BLCK; ++i) + for(int bid = ((b_c / 16) * 16); bid < b_c; bid++) { - c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat[i] * alpha1); + int b_index = bid * b_nstride + g_RD_BLCK; + *((READ_TYPE*)b_dat1) = *((const global READ_TYPE*)(b + Boffset + b_index)); + + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1); + } + } + } + else + { + for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16) + { + + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + } + } + for(int bid = ((b_c / 16) * 16); bid < b_c; bid++) + { + + for(int i = 0; i < RD_BLCK; ++i) + { + c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0); + } } } *((global READ_TYPE*)(c + Coffset + io_index)) = *((READ_TYPE*)c_dat); From 9ae2418adf767794e9475274a4cf90e418f00a58 Mon Sep 17 00:00:00 2001 From: arvindcheru <90783369+arvindcheru@users.noreply.github.com> Date: Mon, 11 Apr 2022 02:06:51 -0400 Subject: [PATCH 3/7] Fix Warning - Update rocBLAS re-org PATH (#1509) --- src/gemm_v2.cpp | 2 +- src/include/miopen/handle.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp index fc8b3222c0..f95337dc0f 100644 --- a/src/gemm_v2.cpp +++ b/src/gemm_v2.cpp @@ -40,7 +40,7 @@ #if MIOPEN_USE_ROCBLAS #include -#include +#include #include #endif diff --git a/src/include/miopen/handle.hpp b/src/include/miopen/handle.hpp index ee45ae2a0b..b99883f5f8 100644 --- a/src/include/miopen/handle.hpp +++ b/src/include/miopen/handle.hpp @@ -52,7 +52,7 @@ #if MIOPEN_USE_ROCBLAS #include -#include +#include #endif namespace miopen { From 0a9c2fd2f417659fc7df572e90c67d31c78ff057 Mon Sep 17 00:00:00 2001 From: Jun Liu Date: Sun, 10 Apr 2022 23:55:57 -0700 Subject: [PATCH 4/7] Revert "Fix Warning - Update rocBLAS re-org PATH (#1509)" This reverts commit 9ae2418adf767794e9475274a4cf90e418f00a58. --- src/gemm_v2.cpp | 2 +- src/include/miopen/handle.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp index f95337dc0f..fc8b3222c0 100644 --- a/src/gemm_v2.cpp +++ b/src/gemm_v2.cpp @@ -40,7 +40,7 @@ #if MIOPEN_USE_ROCBLAS #include -#include +#include #include #endif diff --git a/src/include/miopen/handle.hpp b/src/include/miopen/handle.hpp index b99883f5f8..ee45ae2a0b 100644 --- a/src/include/miopen/handle.hpp +++ b/src/include/miopen/handle.hpp @@ -52,7 +52,7 @@ #if MIOPEN_USE_ROCBLAS #include -#include +#include #endif namespace miopen { From 9aafc24dd4256371f2dc32088f599f66a0c3d4cf Mon Sep 17 00:00:00 2001 From: Zhuoran Yin Date: Mon, 11 Apr 2022 13:44:12 -0400 Subject: [PATCH 5/7] [MLIR] Querying all sub-kernels applicability (#1505) --- src/solver/conv_mlir_igemm_fwd.cpp | 10 +++++++++- src/solver/conv_mlir_igemm_fwd_xdlops.cpp | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/solver/conv_mlir_igemm_fwd.cpp b/src/solver/conv_mlir_igemm_fwd.cpp index 2c8c75c3f4..f89d430707 100644 --- a/src/solver/conv_mlir_igemm_fwd.cpp +++ b/src/solver/conv_mlir_igemm_fwd.cpp @@ -91,7 +91,15 @@ bool PerformanceConvMlirIgemm::IsValid(const ConvolutionContext& ctx) const if(*this == MlirHeuristicInitRequest()) return true; - return MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, *this, false)); + int kernel_count = MiirGetKernelCount(mlir::ConstructBuildOptions(ctx, false)); + bool isValid = false; + for(int kernel_id = 0; kernel_id < kernel_count; ++kernel_id) + { + isValid = MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, *this, false, kernel_id)); + if(!isValid) + return false; + } + return isValid; #else std::ignore = ctx; return false; diff --git a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp index eb2c8d04d0..3798849f93 100644 --- a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp +++ b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp @@ -127,7 +127,15 @@ bool PerformanceConvMlirIgemmXdlops::IsValid(const ConvolutionContext& ctx) cons if(*this == MlirHeuristicInitRequest()) return true; - return MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, *this, true)); + int kernel_count = MiirGetKernelCount(mlir::ConstructBuildOptions(ctx, true)); + bool isValid = false; + for(int kernel_id = 0; kernel_id < kernel_count; ++kernel_id) + { + isValid = MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, *this, true, kernel_id)); + if(!isValid) + return false; + } + return isValid; #else std::ignore = ctx; return false; From f6575a69cdcbbf2e767b39c2a876e240625fc84e Mon Sep 17 00:00:00 2001 From: arvindcheru <90783369+arvindcheru@users.noreply.github.com> Date: Mon, 11 Apr 2022 16:22:35 -0400 Subject: [PATCH 6/7] Updated rocm cmake version requirement (#1514) --- dev-requirements.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 160774795f..26e3737100 100755 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,5 +1,5 @@ ROCmSoftwarePlatform/rocm-recipes -RadeonOpenCompute/rocm-cmake@4a1514850195360ce772182d34b6b5afd8f37253 --build +RadeonOpenCompute/rocm-cmake@bd4e360c73fa366f817f5aa013433e799d8cd659 --build -f requirements.txt # 1.90+ danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f diff --git a/requirements.txt b/requirements.txt index 0747183a26..c5bc5fbd18 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -RadeonOpenCompute/rocm-cmake@4a1514850195360ce772182d34b6b5afd8f37253 --build +RadeonOpenCompute/rocm-cmake@bd4e360c73fa366f817f5aa013433e799d8cd659 --build sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On boost@1.72 -DCMAKE_POSITION_INDEPENDENT_CODE=On --build half,https://github.com/pfultz2/half/archive/1.12.0.tar.gz -X header -H sha256:0a08660b68abb176ebc2a0cdf8de46e3182a7f46c66443bb80dbfaaec98cf969 --build From 3d16ede87b9cc504ab5f6d9798150f3ec0197f5e Mon Sep 17 00:00:00 2001 From: Haocong WANG Date: Tue, 12 Apr 2022 04:25:36 +0800 Subject: [PATCH 7/7] [Tensor reorder][Quality][#issue 1476] Split kernel file & resolve unsolved issues (#1515) --- src/hip/general_tensor_reorder_sol.cpp | 22 +- .../miopen/general_tensor_reorder_sol.hpp | 1 + src/include/miopen/tensor_reorder_util.hpp | 38 +- .../general_tensor_reorder.cpp | 890 ------------------ .../general_tensor_reorder_16x256_byte.cpp | 49 + .../general_tensor_reorder_16x256_dword.cpp | 49 + .../general_tensor_reorder_16x256_dwordx2.cpp | 49 + .../general_tensor_reorder_16x256_half.cpp | 49 + .../general_tensor_reorder_1x256_byte.cpp | 49 + .../general_tensor_reorder_1x256_dword.cpp | 49 + .../general_tensor_reorder_1x256_dwordx2.cpp | 49 + .../general_tensor_reorder_1x256_half.cpp | 49 + .../general_tensor_reorder_2x256_byte.cpp | 49 + .../general_tensor_reorder_2x256_dword.cpp | 49 + .../general_tensor_reorder_2x256_dwordx2.cpp | 49 + .../general_tensor_reorder_2x256_half.cpp | 49 + .../general_tensor_reorder_4x256_byte.cpp | 49 + .../general_tensor_reorder_4x256_dword.cpp | 49 + .../general_tensor_reorder_4x256_dwordx2.cpp | 49 + .../general_tensor_reorder_4x256_half.cpp | 49 + .../general_tensor_reorder_8x256_byte.cpp | 49 + .../general_tensor_reorder_8x256_dword.cpp | 49 + .../general_tensor_reorder_8x256_dwordx2.cpp | 49 + .../general_tensor_reorder_8x256_half.cpp | 49 + .../general_tensor_reorder_kernel_util.hpp | 428 +++++++++ .../order.hpp | 43 - test/tensor_reorder.cpp | 46 +- 27 files changed, 1501 insertions(+), 947 deletions(-) delete mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_byte.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dword.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dwordx2.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_half.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_byte.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dword.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dwordx2.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_half.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_byte.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dword.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dwordx2.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_half.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_byte.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dword.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dwordx2.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_half.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_byte.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dword.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dwordx2.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_half.cpp create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_kernel_util.hpp delete mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/order.hpp diff --git a/src/hip/general_tensor_reorder_sol.cpp b/src/hip/general_tensor_reorder_sol.cpp index 2012a574a0..f78aa358f8 100644 --- a/src/hip/general_tensor_reorder_sol.cpp +++ b/src/hip/general_tensor_reorder_sol.cpp @@ -52,6 +52,17 @@ static inline std::string GetKernelNameType(std::size_t type_size) MIOPEN_THROW("data type not supported"); } +static inline std::string GetKernelFileName(std::size_t data_size, + const GeneralReorderParam* kparam) +{ + if(kparam == nullptr) + MIOPEN_THROW("Memory access fault, kparam is a nullptr"); + std::ostringstream kernel_file_name; + kernel_file_name << "general_tensor_reorder_" << kparam->tile_x << "x" << kparam->tile_y << "_"; + kernel_file_name << GetKernelNameType(data_size) << ".cpp"; + return kernel_file_name.str(); +} + static inline std::string GetKernelName(std::size_t data_size, uint32_t order_0, uint32_t order_1, @@ -140,9 +151,10 @@ solver::KernelInfo GenericReorderSolutionImpl::GetKernelInfo() const (block_size * kernel_param_heuristic.tile_x); std::size_t grid_size = dim_total; - std::string kernel_name = GetKernelName(); + std::string kernel_name = GetKernelName(); + std::string kernel_file_name = GetKernelFileName(); solver::KernelInfo kernel; - kernel.kernel_file = "general_tensor_reorder.cpp"; + kernel.kernel_file = kernel_file_name; kernel.kernel_name = kernel_name; kernel.g_wk.clear(); kernel.g_wk.push_back(grid_size * block_size); @@ -191,6 +203,12 @@ std::vector GenericReorderSolutionImpl::GetKernelArg() const return opArgs; } +std::string GenericReorderSolutionImpl::GetKernelFileName() const +{ + return tensor_reorder::GetKernelFileName(miopen::GetTypeSize(data_type), + &kernel_param_heuristic); +} + std::string GenericReorderSolutionImpl::GetKernelName() const { return tensor_reorder::GetKernelName(miopen::GetTypeSize(data_type), diff --git a/src/include/miopen/general_tensor_reorder_sol.hpp b/src/include/miopen/general_tensor_reorder_sol.hpp index 41f387f995..0df1d9cd0c 100644 --- a/src/include/miopen/general_tensor_reorder_sol.hpp +++ b/src/include/miopen/general_tensor_reorder_sol.hpp @@ -58,6 +58,7 @@ struct GenericReorderSolutionImpl // TODO batched transpose API solver::KernelInfo GetKernelInfo() const; std::vector GetKernelArg() const; + std::string GetKernelFileName() const; std::string GetKernelName() const; bool IsSkippable() const; size_t GetOutputTensorSize() const; diff --git a/src/include/miopen/tensor_reorder_util.hpp b/src/include/miopen/tensor_reorder_util.hpp index 1010b98e95..24b7b96650 100644 --- a/src/include/miopen/tensor_reorder_util.hpp +++ b/src/include/miopen/tensor_reorder_util.hpp @@ -169,26 +169,49 @@ struct GenericReorderSolution : TensorReorderAttributesBase inline std::unique_ptr MakeTensorReorderAttributes(const ExecutionContext& ctx_, miopenDataType_t data_type_, - uint32_t dim_0_, - uint32_t dim_1_, - uint32_t dim_2_, - uint32_t dim_3_, + uint64_t dim_0_, + uint64_t dim_1_, + uint64_t dim_2_, + uint64_t dim_3_, uint32_t order_0_, uint32_t order_1_, uint32_t order_2_, uint32_t order_3_) { - std::unique_ptr default_ptr; if(!ctx_.use_hip_kernels) { - return default_ptr; + return nullptr; + } + ///\todo 64-bit tensor dimension + // Currentlly we have tensor dimension limited to 2^32 - 1 + if((dim_0_ > UINT32_MAX) || (dim_1_ > UINT32_MAX) || (dim_2_ > UINT32_MAX) || + (dim_3_ > UINT32_MAX)) + { + MIOPEN_THROW("Currentlly we have tensor dimension limitation of 2^32 - 1"); } // Default using general reorder if(data_type_ == miopenBFloat16) { MIOPEN_THROW("Unsupported reorder data type"); } + // Check required reorder is legal or not + std::vector> all_possible_order{ + {0, 1, 3, 2}, {0, 2, 1, 3}, {0, 2, 3, 1}, {0, 3, 1, 2}, {0, 3, 2, 1}, {1, 0, 2, 3}, + {1, 0, 3, 2}, {1, 2, 0, 3}, {1, 2, 3, 0}, {1, 3, 0, 2}, {1, 3, 2, 0}, {2, 0, 1, 3}, + {2, 0, 3, 1}, {2, 1, 0, 3}, {2, 1, 3, 0}, {2, 3, 0, 1}, {2, 3, 1, 0}, {3, 0, 1, 2}, + {3, 0, 2, 1}, {3, 1, 0, 2}, {3, 1, 2, 0}, {3, 2, 0, 1}, {3, 2, 1, 0}}; + auto found = std::find_if( + all_possible_order.begin(), + all_possible_order.end(), + [order_0_, order_1_, order_2_, order_3_](std::vector possible_order) { + return (possible_order[0] == order_0_) && (possible_order[1] == order_1_) && + (possible_order[2] == order_2_) && (possible_order[3] == order_3_); + }); + if(found == all_possible_order.end()) + MIOPEN_THROW("Unsupported Reorder"); + int which = 0; + // Special cases that utilize batched transpose kernel if(data_type_ != miopenDouble) { if((order_0_ == 0) && (order_1_ == 1) && (order_2_ == 3) && (order_3_ == 2)) @@ -235,9 +258,8 @@ MakeTensorReorderAttributes(const ExecutionContext& ctx_, case 5: return std::make_unique( ctx_, data_type_, dim_0_, dim_1_, dim_2_, dim_3_); - default: MIOPEN_THROW("Unsupported reorder sequence"); break; + default: MIOPEN_THROW("Failed to call tensor reorder solver"); } - return default_ptr; } } // namespace miopen diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp deleted file mode 100644 index e8f236c36e..0000000000 --- a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp +++ /dev/null @@ -1,890 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS -#include -#endif -#include -#include "order.hpp" - -#ifndef TENSOR_REORDER_OCCUPANCY -#define TENSOR_REORDER_OCCUPANCY 4 -#endif - -inline __device__ uint32_t magic_div_u32(const uint32_t& numer, - const uint32_t& magic, - const uint32_t& shift) -{ - uint32_t tmp = __umulhi(numer, magic); - return (tmp + numer) >> shift; -} - -template -inline __device__ void general_4d_reorder_1x256(T* dst, - T* src, - uint32_t dim_0, - uint32_t dim_1, - uint32_t dim_2, - uint32_t dim_3, - uint32_t dim_stride, - uint32_t dim_total, - uint32_t magic_stride0, - uint32_t shift_stride0, - uint32_t magic_stride1, - uint32_t shift_stride1, - uint32_t magic_stride2, - uint32_t shift_stride2) -{ - constexpr auto dorder = dst_order{}; - uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; - uint32_t src_index, dst_index; - const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; - const uint64_t dst_dim[4] = { - src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; - const uint64_t src_stride[4] = { - src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; - const uint64_t dst_stride[4] = { - dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; - - uint32_t i_src[4] = {0, 0, 0, 0}; - uint32_t i_dst[4] = {0, 0, 0, 0}; - - for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) - { - for(uint32_t k = 0; k < 1; k++) - { - // unroll k block thread - src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; - if(src_index < pixel_total) - { - i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); - i_src[1] = magic_div_u32( - src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); - i_src[2] = - magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], - magic_stride2, - shift_stride2); - i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - - i_src[2] * src_stride[2]; - - i_dst[0] = i_src[dorder.at(0)]; - i_dst[1] = i_src[dorder.at(1)]; - i_dst[2] = i_src[dorder.at(2)]; - i_dst[3] = i_src[dorder.at(3)]; - - dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + - i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; - dst[dst_index] = src[src_index]; - } - } - } -} - -template -inline __device__ void general_4d_reorder_2x256(T* dst, - T* src, - uint32_t dim_0, - uint32_t dim_1, - uint32_t dim_2, - uint32_t dim_3, - uint32_t dim_stride, - uint32_t dim_total, - uint32_t magic_stride0, - uint32_t shift_stride0, - uint32_t magic_stride1, - uint32_t shift_stride1, - uint32_t magic_stride2, - uint32_t shift_stride2) -{ - constexpr auto dorder = dst_order{}; - uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; - uint32_t src_index, dst_index; - const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; - const uint64_t dst_dim[4] = { - src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; - const uint64_t src_stride[4] = { - src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; - const uint64_t dst_stride[4] = { - dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; - - uint32_t i_src[4] = {0, 0, 0, 0}; - uint32_t i_dst[4] = {0, 0, 0, 0}; - - for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) - { - for(uint32_t k = 0; k < 2; k++) - { - // unroll k block thread - src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; - if(src_index < pixel_total) - { - i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); - i_src[1] = magic_div_u32( - src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); - i_src[2] = - magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], - magic_stride2, - shift_stride2); - i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - - i_src[2] * src_stride[2]; - - i_dst[0] = i_src[dorder.at(0)]; - i_dst[1] = i_src[dorder.at(1)]; - i_dst[2] = i_src[dorder.at(2)]; - i_dst[3] = i_src[dorder.at(3)]; - - dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + - i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; - dst[dst_index] = src[src_index]; - } - } - } -} - -template -inline __device__ void general_4d_reorder_4x256(T* dst, - T* src, - uint32_t dim_0, - uint32_t dim_1, - uint32_t dim_2, - uint32_t dim_3, - uint32_t dim_stride, - uint32_t dim_total, - uint32_t magic_stride0, - uint32_t shift_stride0, - uint32_t magic_stride1, - uint32_t shift_stride1, - uint32_t magic_stride2, - uint32_t shift_stride2) -{ - constexpr auto dorder = dst_order{}; - uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; - uint32_t src_index, dst_index; - const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; - const uint64_t dst_dim[4] = { - src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; - const uint64_t src_stride[4] = { - src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; - const uint64_t dst_stride[4] = { - dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; - - uint32_t i_src[4] = {0, 0, 0, 0}; - uint32_t i_dst[4] = {0, 0, 0, 0}; - - for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) - { - for(uint32_t k = 0; k < 4; k++) - { - // unroll k block thread - src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; - if(src_index < pixel_total) - { - i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); - i_src[1] = magic_div_u32( - src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); - i_src[2] = - magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], - magic_stride2, - shift_stride2); - i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - - i_src[2] * src_stride[2]; - - i_dst[0] = i_src[dorder.at(0)]; - i_dst[1] = i_src[dorder.at(1)]; - i_dst[2] = i_src[dorder.at(2)]; - i_dst[3] = i_src[dorder.at(3)]; - - dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + - i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; - dst[dst_index] = src[src_index]; - } - } - } -} - -template -inline __device__ void general_4d_reorder_8x256(T* dst, - T* src, - uint32_t dim_0, - uint32_t dim_1, - uint32_t dim_2, - uint32_t dim_3, - uint32_t dim_stride, - uint32_t dim_total, - uint32_t magic_stride0, - uint32_t shift_stride0, - uint32_t magic_stride1, - uint32_t shift_stride1, - uint32_t magic_stride2, - uint32_t shift_stride2) -{ - constexpr auto dorder = dst_order{}; - uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; - uint32_t src_index, dst_index; - const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; - const uint64_t dst_dim[4] = { - src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; - const uint64_t src_stride[4] = { - src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; - const uint64_t dst_stride[4] = { - dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; - - uint32_t i_src[4] = {0, 0, 0, 0}; - uint32_t i_dst[4] = {0, 0, 0, 0}; - - for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) - { - for(uint32_t k = 0; k < 8; k++) - { - // unroll k block thread - src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; - if(src_index < pixel_total) - { - i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); - i_src[1] = magic_div_u32( - src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); - i_src[2] = - magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], - magic_stride2, - shift_stride2); - i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - - i_src[2] * src_stride[2]; - - i_dst[0] = i_src[dorder.at(0)]; - i_dst[1] = i_src[dorder.at(1)]; - i_dst[2] = i_src[dorder.at(2)]; - i_dst[3] = i_src[dorder.at(3)]; - - dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + - i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; - dst[dst_index] = src[src_index]; - } - } - } -} - -template -inline __device__ void general_4d_reorder_16x256(T* dst, - T* src, - uint32_t dim_0, - uint32_t dim_1, - uint32_t dim_2, - uint32_t dim_3, - uint32_t dim_stride, - uint32_t dim_total, - uint32_t magic_stride0, - uint32_t shift_stride0, - uint32_t magic_stride1, - uint32_t shift_stride1, - uint32_t magic_stride2, - uint32_t shift_stride2) -{ - constexpr auto dorder = dst_order{}; - uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; - uint32_t src_index, dst_index; - const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; - const uint64_t dst_dim[4] = { - src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; - const uint64_t src_stride[4] = { - src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; - const uint64_t dst_stride[4] = { - dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; - - uint32_t i_src[4] = {0, 0, 0, 0}; - uint32_t i_dst[4] = {0, 0, 0, 0}; - - for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) - { - for(uint32_t k = 0; k < 16; k++) - { - // unroll k block thread - src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; - if(src_index < pixel_total) - { - i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); - i_src[1] = magic_div_u32( - src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); - i_src[2] = - magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], - magic_stride2, - shift_stride2); - i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - - i_src[2] * src_stride[2]; - - i_dst[0] = i_src[dorder.at(0)]; - i_dst[1] = i_src[dorder.at(1)]; - i_dst[2] = i_src[dorder.at(2)]; - i_dst[3] = i_src[dorder.at(3)]; - - dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + - i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; - dst[dst_index] = src[src_index]; - } - } - } -} - -#define DEFINE_GENERAL_4D_REORDER_KERNEL(tile_trait, \ - dst_order, \ - accept_data_type, \ - cast_data_type, \ - lb_threads_per_block, \ - lb_blocks_per_cu) \ - extern "C" __global__ void __launch_bounds__(lb_threads_per_block, lb_blocks_per_cu) \ - general_4d_reorder_##tile_trait##_##accept_data_type##_##dst_order(void* dst, \ - void* src, \ - uint32_t dim_0, \ - uint32_t dim_1, \ - uint32_t dim_2, \ - uint32_t dim_3, \ - uint32_t dim_stride, \ - uint32_t dim_total, \ - uint32_t magic_stride0, \ - uint32_t shift_stride0, \ - uint32_t magic_stride1, \ - uint32_t shift_stride1, \ - uint32_t magic_stride2, \ - uint32_t shift_stride2) \ - { \ - general_4d_reorder_##tile_trait( \ - reinterpret_cast(dst), \ - reinterpret_cast(src), \ - dim_0, \ - dim_1, \ - dim_2, \ - dim_3, \ - dim_stride, \ - dim_total, \ - magic_stride0, \ - shift_stride0, \ - magic_stride1, \ - shift_stride1, \ - magic_stride2, \ - shift_stride2); \ - } -// default order is 0 1 2 3 -using r0132 = order<0, 1, 3, 2>; -using r0213 = order<0, 2, 1, 3>; // nhwc2nchwc -using r0231 = order<0, 2, 3, 1>; // nchw2nchwc -using r0312 = order<0, 3, 1, 2>; // nhwc2nchw -using r0321 = order<0, 3, 2, 1>; -using r1023 = order<1, 0, 2, 3>; -using r1032 = order<1, 0, 3, 2>; -using r1203 = order<1, 2, 0, 3>; -using r1230 = order<1, 2, 3, 0>; -using r1302 = order<1, 3, 0, 2>; // nchw2chwnc -using r1320 = order<1, 3, 2, 0>; -using r2013 = order<2, 0, 1, 3>; -using r2031 = order<2, 0, 3, 1>; -using r2103 = order<2, 1, 0, 3>; // nhwc2chwnc -using r2130 = order<2, 1, 3, 0>; -using r2301 = order<2, 3, 0, 1>; -using r2310 = order<2, 3, 1, 0>; -using r3012 = order<3, 0, 1, 2>; -using r3021 = order<3, 0, 2, 1>; -using r3102 = order<3, 1, 0, 2>; -using r3120 = order<3, 1, 2, 0>; -using r3201 = order<3, 2, 0, 1>; -using r3210 = order<3, 2, 1, 0>; - -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) - -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) -DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_byte.cpp new file mode 100644 index 0000000000..dc39823ab8 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_byte.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dword.cpp new file mode 100644 index 0000000000..fd868304ae --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dword.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dwordx2.cpp new file mode 100644 index 0000000000..fc909017b4 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dwordx2.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_half.cpp new file mode 100644 index 0000000000..28c8d3f166 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_half.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_byte.cpp new file mode 100644 index 0000000000..0c2c5d1109 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_byte.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dword.cpp new file mode 100644 index 0000000000..8d90e2ff07 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dword.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dwordx2.cpp new file mode 100644 index 0000000000..fa5ec29887 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dwordx2.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_half.cpp new file mode 100644 index 0000000000..f7cc9315a0 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_half.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_byte.cpp new file mode 100644 index 0000000000..2da1b46c75 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_byte.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dword.cpp new file mode 100644 index 0000000000..b902bc4403 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dword.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dwordx2.cpp new file mode 100644 index 0000000000..58a25a2c00 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dwordx2.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_half.cpp new file mode 100644 index 0000000000..dcfd36342f --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_half.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_byte.cpp new file mode 100644 index 0000000000..0cc96944a9 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_byte.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dword.cpp new file mode 100644 index 0000000000..a6f92c4a0f --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dword.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dwordx2.cpp new file mode 100644 index 0000000000..42ffd27a72 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dwordx2.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_half.cpp new file mode 100644 index 0000000000..aebe744280 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_half.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_byte.cpp new file mode 100644 index 0000000000..51b68958f5 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_byte.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dword.cpp new file mode 100644 index 0000000000..f8c7d50084 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dword.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dwordx2.cpp new file mode 100644 index 0000000000..682dd19b6f --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dwordx2.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_half.cpp new file mode 100644 index 0000000000..c25649db3d --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_half.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "general_tensor_reorder_kernel_util.hpp" +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) +DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY) diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_kernel_util.hpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_kernel_util.hpp new file mode 100644 index 0000000000..3568627990 --- /dev/null +++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_kernel_util.hpp @@ -0,0 +1,428 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GENERAL_TENSOR_REORDER_UTIL_HPP +#include + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#ifndef TENSOR_REORDER_OCCUPANCY +#define TENSOR_REORDER_OCCUPANCY 4 +#endif +#define GENERAL_TENSOR_REORDER_UTIL_HPP + +template +struct order +{ + static constexpr std::size_t m_size = sizeof...(Is); + // the last dummy element is to prevent compiler complain about empty array, when mSize = 0 + static constexpr int m_data[m_size + 1] = {Is..., 0}; + + __host__ __device__ static constexpr uint64_t size() { return m_size; } + + __host__ __device__ static constexpr uint64_t get_size() { return size(); } + + __host__ __device__ static constexpr int at(int I) { return m_data[I]; } +}; + +inline __device__ uint32_t magic_div_u32(const uint32_t& numer, + const uint32_t& magic, + const uint32_t& shift) +{ + uint32_t tmp = __umulhi(numer, magic); + return (tmp + numer) >> shift; +} + +template +inline __device__ void general_4d_reorder_1x256(T* dst, + T* src, + uint32_t dim_0, + uint32_t dim_1, + uint32_t dim_2, + uint32_t dim_3, + uint32_t dim_stride, + uint32_t dim_total, + uint32_t magic_stride0, + uint32_t shift_stride0, + uint32_t magic_stride1, + uint32_t shift_stride1, + uint32_t magic_stride2, + uint32_t shift_stride2) +{ + constexpr auto dorder = dst_order{}; + uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; + uint32_t src_index, dst_index; + const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; + const uint64_t dst_dim[4] = { + src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; + const uint64_t src_stride[4] = { + src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; + const uint64_t dst_stride[4] = { + dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; + + uint32_t i_src[4] = {0, 0, 0, 0}; + uint32_t i_dst[4] = {0, 0, 0, 0}; + + for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) + { + for(uint32_t k = 0; k < 1; k++) + { + // unroll k block thread + src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; + if(src_index < pixel_total) + { + i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); + i_src[1] = magic_div_u32( + src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); + i_src[2] = + magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], + magic_stride2, + shift_stride2); + i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - + i_src[2] * src_stride[2]; + + i_dst[0] = i_src[dorder.at(0)]; + i_dst[1] = i_src[dorder.at(1)]; + i_dst[2] = i_src[dorder.at(2)]; + i_dst[3] = i_src[dorder.at(3)]; + + dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + + i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; + dst[dst_index] = src[src_index]; + } + } + } +} + +template +inline __device__ void general_4d_reorder_2x256(T* dst, + T* src, + uint32_t dim_0, + uint32_t dim_1, + uint32_t dim_2, + uint32_t dim_3, + uint32_t dim_stride, + uint32_t dim_total, + uint32_t magic_stride0, + uint32_t shift_stride0, + uint32_t magic_stride1, + uint32_t shift_stride1, + uint32_t magic_stride2, + uint32_t shift_stride2) +{ + constexpr auto dorder = dst_order{}; + uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; + uint32_t src_index, dst_index; + const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; + const uint64_t dst_dim[4] = { + src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; + const uint64_t src_stride[4] = { + src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; + const uint64_t dst_stride[4] = { + dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; + + uint32_t i_src[4] = {0, 0, 0, 0}; + uint32_t i_dst[4] = {0, 0, 0, 0}; + + for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) + { + for(uint32_t k = 0; k < 2; k++) + { + // unroll k block thread + src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; + if(src_index < pixel_total) + { + i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); + i_src[1] = magic_div_u32( + src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); + i_src[2] = + magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], + magic_stride2, + shift_stride2); + i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - + i_src[2] * src_stride[2]; + + i_dst[0] = i_src[dorder.at(0)]; + i_dst[1] = i_src[dorder.at(1)]; + i_dst[2] = i_src[dorder.at(2)]; + i_dst[3] = i_src[dorder.at(3)]; + + dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + + i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; + dst[dst_index] = src[src_index]; + } + } + } +} + +template +inline __device__ void general_4d_reorder_4x256(T* dst, + T* src, + uint32_t dim_0, + uint32_t dim_1, + uint32_t dim_2, + uint32_t dim_3, + uint32_t dim_stride, + uint32_t dim_total, + uint32_t magic_stride0, + uint32_t shift_stride0, + uint32_t magic_stride1, + uint32_t shift_stride1, + uint32_t magic_stride2, + uint32_t shift_stride2) +{ + constexpr auto dorder = dst_order{}; + uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; + uint32_t src_index, dst_index; + const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; + const uint64_t dst_dim[4] = { + src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; + const uint64_t src_stride[4] = { + src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; + const uint64_t dst_stride[4] = { + dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; + + uint32_t i_src[4] = {0, 0, 0, 0}; + uint32_t i_dst[4] = {0, 0, 0, 0}; + + for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) + { + for(uint32_t k = 0; k < 4; k++) + { + // unroll k block thread + src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; + if(src_index < pixel_total) + { + i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); + i_src[1] = magic_div_u32( + src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); + i_src[2] = + magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], + magic_stride2, + shift_stride2); + i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - + i_src[2] * src_stride[2]; + + i_dst[0] = i_src[dorder.at(0)]; + i_dst[1] = i_src[dorder.at(1)]; + i_dst[2] = i_src[dorder.at(2)]; + i_dst[3] = i_src[dorder.at(3)]; + + dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + + i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; + dst[dst_index] = src[src_index]; + } + } + } +} + +template +inline __device__ void general_4d_reorder_8x256(T* dst, + T* src, + uint32_t dim_0, + uint32_t dim_1, + uint32_t dim_2, + uint32_t dim_3, + uint32_t dim_stride, + uint32_t dim_total, + uint32_t magic_stride0, + uint32_t shift_stride0, + uint32_t magic_stride1, + uint32_t shift_stride1, + uint32_t magic_stride2, + uint32_t shift_stride2) +{ + constexpr auto dorder = dst_order{}; + uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; + uint32_t src_index, dst_index; + const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; + const uint64_t dst_dim[4] = { + src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; + const uint64_t src_stride[4] = { + src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; + const uint64_t dst_stride[4] = { + dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; + + uint32_t i_src[4] = {0, 0, 0, 0}; + uint32_t i_dst[4] = {0, 0, 0, 0}; + + for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) + { + for(uint32_t k = 0; k < 8; k++) + { + // unroll k block thread + src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; + if(src_index < pixel_total) + { + i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); + i_src[1] = magic_div_u32( + src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); + i_src[2] = + magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], + magic_stride2, + shift_stride2); + i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - + i_src[2] * src_stride[2]; + + i_dst[0] = i_src[dorder.at(0)]; + i_dst[1] = i_src[dorder.at(1)]; + i_dst[2] = i_src[dorder.at(2)]; + i_dst[3] = i_src[dorder.at(3)]; + + dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + + i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; + dst[dst_index] = src[src_index]; + } + } + } +} + +template +inline __device__ void general_4d_reorder_16x256(T* dst, + T* src, + uint32_t dim_0, + uint32_t dim_1, + uint32_t dim_2, + uint32_t dim_3, + uint32_t dim_stride, + uint32_t dim_total, + uint32_t magic_stride0, + uint32_t shift_stride0, + uint32_t magic_stride1, + uint32_t shift_stride1, + uint32_t magic_stride2, + uint32_t shift_stride2) +{ + constexpr auto dorder = dst_order{}; + uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3; + uint32_t src_index, dst_index; + const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3}; + const uint64_t dst_dim[4] = { + src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]}; + const uint64_t src_stride[4] = { + src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1}; + const uint64_t dst_stride[4] = { + dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1}; + + uint32_t i_src[4] = {0, 0, 0, 0}; + uint32_t i_dst[4] = {0, 0, 0, 0}; + + for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride) + { + for(uint32_t k = 0; k < 16; k++) + { + // unroll k block thread + src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x; + if(src_index < pixel_total) + { + i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0); + i_src[1] = magic_div_u32( + src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1); + i_src[2] = + magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1], + magic_stride2, + shift_stride2); + i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] - + i_src[2] * src_stride[2]; + + i_dst[0] = i_src[dorder.at(0)]; + i_dst[1] = i_src[dorder.at(1)]; + i_dst[2] = i_src[dorder.at(2)]; + i_dst[3] = i_src[dorder.at(3)]; + + dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] + + i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3]; + dst[dst_index] = src[src_index]; + } + } + } +} + +#define DEFINE_GENERAL_4D_REORDER_KERNEL(tile_trait, \ + dst_order, \ + accept_data_type, \ + cast_data_type, \ + lb_threads_per_block, \ + lb_blocks_per_cu) \ + extern "C" __global__ void __launch_bounds__(lb_threads_per_block, lb_blocks_per_cu) \ + general_4d_reorder_##tile_trait##_##accept_data_type##_##dst_order(void* dst, \ + void* src, \ + uint32_t dim_0, \ + uint32_t dim_1, \ + uint32_t dim_2, \ + uint32_t dim_3, \ + uint32_t dim_stride, \ + uint32_t dim_total, \ + uint32_t magic_stride0, \ + uint32_t shift_stride0, \ + uint32_t magic_stride1, \ + uint32_t shift_stride1, \ + uint32_t magic_stride2, \ + uint32_t shift_stride2) \ + { \ + general_4d_reorder_##tile_trait( \ + reinterpret_cast(dst), \ + reinterpret_cast(src), \ + dim_0, \ + dim_1, \ + dim_2, \ + dim_3, \ + dim_stride, \ + dim_total, \ + magic_stride0, \ + shift_stride0, \ + magic_stride1, \ + shift_stride1, \ + magic_stride2, \ + shift_stride2); \ + } +// default order is 0 1 2 3 +using r0132 = order<0, 1, 3, 2>; +using r0213 = order<0, 2, 1, 3>; // nhwc2nchwc +using r0231 = order<0, 2, 3, 1>; // nchw2nchwc +using r0312 = order<0, 3, 1, 2>; // nhwc2nchw +using r0321 = order<0, 3, 2, 1>; +using r1023 = order<1, 0, 2, 3>; +using r1032 = order<1, 0, 3, 2>; +using r1203 = order<1, 2, 0, 3>; +using r1230 = order<1, 2, 3, 0>; +using r1302 = order<1, 3, 0, 2>; // nchw2chwnc +using r1320 = order<1, 3, 2, 0>; +using r2013 = order<2, 0, 1, 3>; +using r2031 = order<2, 0, 3, 1>; +using r2103 = order<2, 1, 0, 3>; // nhwc2chwnc +using r2130 = order<2, 1, 3, 0>; +using r2301 = order<2, 3, 0, 1>; +using r2310 = order<2, 3, 1, 0>; +using r3012 = order<3, 0, 1, 2>; +using r3021 = order<3, 0, 2, 1>; +using r3102 = order<3, 1, 0, 2>; +using r3120 = order<3, 1, 2, 0>; +using r3201 = order<3, 2, 0, 1>; +using r3210 = order<3, 2, 1, 0>; +#endif diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp b/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp deleted file mode 100644 index c8e80f7e7f..0000000000 --- a/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp +++ /dev/null @@ -1,43 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2020-2022 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include -#ifndef ORDER_HPP -#define ORDER_HPP - -template -struct order -{ - static constexpr std::size_t m_size = sizeof...(Is); - // the last dummy element is to prevent compiler complain about empty array, when mSize = 0 - static constexpr int m_data[m_size + 1] = {Is..., 0}; - - __host__ __device__ static constexpr uint64_t size() { return m_size; } - - __host__ __device__ static constexpr uint64_t get_size() { return size(); } - - __host__ __device__ static constexpr int at(int I) { return m_data[I]; } -}; -#endif diff --git a/test/tensor_reorder.cpp b/test/tensor_reorder.cpp index d1e2e1d1ce..452dd1b570 100644 --- a/test/tensor_reorder.cpp +++ b/test/tensor_reorder.cpp @@ -314,9 +314,9 @@ struct tensor_reorder_base_driver : test_driver } ~tensor_reorder_base_driver() { miopenDestroy(handle); } - static std::vector get_dim_3_size() { return {1, 9, 14}; } - static std::vector get_dim_2_size() { return {1, 9, 14}; } - static std::vector get_dim_1_size() { return {3, 8, 14}; } + static std::vector get_dim_3_size() { return {1, 9}; } + static std::vector get_dim_2_size() { return {1, 9}; } + static std::vector get_dim_1_size() { return {3, 8}; } static std::vector get_dim_0_size() { return {1, 2}; } template @@ -504,4 +504,42 @@ struct tensor_reorder_driver : tensor_reorder_base_driver } }; -int main(int argc, const char* argv[]) { test_drive(argc, argv); } +template