From 8d205b4458bbc5bd040f52764ed25194e23798ff Mon Sep 17 00:00:00 2001
From: JD <jahandad@gmail.com>
Date: Mon, 11 Apr 2022 01:03:26 -0500
Subject: [PATCH 1/7] Update MIOpen Version to 2.17 (#1511)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4d2abd648..18a059ace9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,7 +86,7 @@ if(NOT WIN32 AND NOT APPLE)
     set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
 endif()
 
-rocm_setup_version(VERSION 2.16.0)
+rocm_setup_version(VERSION 2.17.0)
 
 list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake )
 include(TargetFlags)

From 1aa63be0c5ae053f24e0ba1efa1fb720e8e00f30 Mon Sep 17 00:00:00 2001
From: Murali N <33875998+muralinr@users.noreply.github.com>
Date: Sun, 10 Apr 2022 23:05:17 -0700
Subject: [PATCH 2/7] [RNN][Op2dTensorSquash] Tensor squash kernel
 optimizations (#1491)

---
 src/kernels/MIOpenTensorKernels.cl | 139 +++++++++++++++++++++++++++--
 1 file changed, 131 insertions(+), 8 deletions(-)

diff --git a/src/kernels/MIOpenTensorKernels.cl b/src/kernels/MIOpenTensorKernels.cl
index f95b65a21f..f684ce0e82 100644
--- a/src/kernels/MIOpenTensorKernels.cl
+++ b/src/kernels/MIOpenTensorKernels.cl
@@ -635,12 +635,43 @@ __kernel void Op2dTensorSquash(const global MIOPEN_TYPE* a,
                                const int use_bet)
 {
     MIOPEN_TYPE a_dat[RD_BLCK];
-    MIOPEN_TYPE b_dat[RD_BLCK];
+    MIOPEN_TYPE b_dat1[RD_BLCK];
+    MIOPEN_TYPE b_dat2[RD_BLCK];
+    MIOPEN_TYPE b_dat3[RD_BLCK];
+    MIOPEN_TYPE b_dat4[RD_BLCK];
+    MIOPEN_TYPE b_dat5[RD_BLCK];
+    MIOPEN_TYPE b_dat6[RD_BLCK];
+    MIOPEN_TYPE b_dat7[RD_BLCK];
+    MIOPEN_TYPE b_dat8[RD_BLCK];
+    MIOPEN_TYPE b_dat9[RD_BLCK];
+    MIOPEN_TYPE b_dat10[RD_BLCK];
+    MIOPEN_TYPE b_dat11[RD_BLCK];
+    MIOPEN_TYPE b_dat12[RD_BLCK];
+    MIOPEN_TYPE b_dat13[RD_BLCK];
+    MIOPEN_TYPE b_dat14[RD_BLCK];
+    MIOPEN_TYPE b_dat15[RD_BLCK];
+    MIOPEN_TYPE b_dat16[RD_BLCK];
     MIOPEN_TYPE c_dat[RD_BLCK];
+    int g_RD_BLCK;
 
     for(int i = 0; i < RD_BLCK; ++i)
     {
-        b_dat[i] = (MIOPEN_TYPE)0;
+        b_dat1[i]  = (MIOPEN_TYPE)0;
+        b_dat2[i]  = (MIOPEN_TYPE)0;
+        b_dat3[i]  = (MIOPEN_TYPE)0;
+        b_dat4[i]  = (MIOPEN_TYPE)0;
+        b_dat5[i]  = (MIOPEN_TYPE)0;
+        b_dat6[i]  = (MIOPEN_TYPE)0;
+        b_dat7[i]  = (MIOPEN_TYPE)0;
+        b_dat8[i]  = (MIOPEN_TYPE)0;
+        b_dat9[i]  = (MIOPEN_TYPE)0;
+        b_dat10[i] = (MIOPEN_TYPE)0;
+        b_dat11[i] = (MIOPEN_TYPE)0;
+        b_dat12[i] = (MIOPEN_TYPE)0;
+        b_dat13[i] = (MIOPEN_TYPE)0;
+        b_dat14[i] = (MIOPEN_TYPE)0;
+        b_dat15[i] = (MIOPEN_TYPE)0;
+        b_dat16[i] = (MIOPEN_TYPE)0;
     }
 
     for(int gid = get_global_id(0); gid < total_work; gid += get_global_size(0))
@@ -670,16 +701,108 @@ __kernel void Op2dTensorSquash(const global MIOPEN_TYPE* a,
             }
         }
 
-        for(int bid = 0; bid < b_c; bid++)
+        g_RD_BLCK = gid * RD_BLCK;
+        if(use_apl1 == 1)
         {
-            if(use_apl1 == 1)
+            for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16)
             {
-                int b_index          = bid * b_nstride + gid * RD_BLCK;
-                *((READ_TYPE*)b_dat) = *((const global READ_TYPE*)(b + Boffset + b_index));
+
+                int b_index1           = (bid * b_nstride) + g_RD_BLCK;
+                int b_index2           = ((bid + 1) * b_nstride) + g_RD_BLCK;
+                int b_index3           = ((bid + 2) * b_nstride) + g_RD_BLCK;
+                int b_index4           = ((bid + 3) * b_nstride) + g_RD_BLCK;
+                int b_index5           = ((bid + 4) * b_nstride) + g_RD_BLCK;
+                int b_index6           = ((bid + 5) * b_nstride) + g_RD_BLCK;
+                int b_index7           = ((bid + 6) * b_nstride) + g_RD_BLCK;
+                int b_index8           = ((bid + 7) * b_nstride) + g_RD_BLCK;
+                int b_index9           = ((bid + 8) * b_nstride) + g_RD_BLCK;
+                int b_index10          = ((bid + 9) * b_nstride) + g_RD_BLCK;
+                int b_index11          = ((bid + 10) * b_nstride) + g_RD_BLCK;
+                int b_index12          = ((bid + 11) * b_nstride) + g_RD_BLCK;
+                int b_index13          = ((bid + 12) * b_nstride) + g_RD_BLCK;
+                int b_index14          = ((bid + 13) * b_nstride) + g_RD_BLCK;
+                int b_index15          = ((bid + 14) * b_nstride) + g_RD_BLCK;
+                int b_index16          = ((bid + 15) * b_nstride) + g_RD_BLCK;
+                *((READ_TYPE*)b_dat1)  = *((const global READ_TYPE*)(b + Boffset + b_index1));
+                *((READ_TYPE*)b_dat2)  = *((const global READ_TYPE*)(b + Boffset + b_index2));
+                *((READ_TYPE*)b_dat3)  = *((const global READ_TYPE*)(b + Boffset + b_index3));
+                *((READ_TYPE*)b_dat4)  = *((const global READ_TYPE*)(b + Boffset + b_index4));
+                *((READ_TYPE*)b_dat5)  = *((const global READ_TYPE*)(b + Boffset + b_index5));
+                *((READ_TYPE*)b_dat6)  = *((const global READ_TYPE*)(b + Boffset + b_index6));
+                *((READ_TYPE*)b_dat7)  = *((const global READ_TYPE*)(b + Boffset + b_index7));
+                *((READ_TYPE*)b_dat8)  = *((const global READ_TYPE*)(b + Boffset + b_index8));
+                *((READ_TYPE*)b_dat9)  = *((const global READ_TYPE*)(b + Boffset + b_index9));
+                *((READ_TYPE*)b_dat10) = *((const global READ_TYPE*)(b + Boffset + b_index10));
+                *((READ_TYPE*)b_dat11) = *((const global READ_TYPE*)(b + Boffset + b_index11));
+                *((READ_TYPE*)b_dat12) = *((const global READ_TYPE*)(b + Boffset + b_index12));
+                *((READ_TYPE*)b_dat13) = *((const global READ_TYPE*)(b + Boffset + b_index13));
+                *((READ_TYPE*)b_dat14) = *((const global READ_TYPE*)(b + Boffset + b_index14));
+                *((READ_TYPE*)b_dat15) = *((const global READ_TYPE*)(b + Boffset + b_index15));
+                *((READ_TYPE*)b_dat16) = *((const global READ_TYPE*)(b + Boffset + b_index16));
+
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat2[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat3[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat4[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat5[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat6[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat7[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat8[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat9[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat10[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat11[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat12[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat13[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat14[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat15[i] * alpha1);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat16[i] * alpha1);
+                }
             }
-            for(int i = 0; i < RD_BLCK; ++i)
+            for(int bid = ((b_c / 16) * 16); bid < b_c; bid++)
             {
-                c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat[i] * alpha1);
+                int b_index           = bid * b_nstride + g_RD_BLCK;
+                *((READ_TYPE*)b_dat1) = *((const global READ_TYPE*)(b + Boffset + b_index));
+
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1);
+                }
+            }
+        }
+        else
+        {
+            for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16)
+            {
+
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                }
+            }
+            for(int bid = ((b_c / 16) * 16); bid < b_c; bid++)
+            {
+
+                for(int i = 0; i < RD_BLCK; ++i)
+                {
+                    c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
+                }
             }
         }
         *((global READ_TYPE*)(c + Coffset + io_index)) = *((READ_TYPE*)c_dat);

From 9ae2418adf767794e9475274a4cf90e418f00a58 Mon Sep 17 00:00:00 2001
From: arvindcheru <90783369+arvindcheru@users.noreply.github.com>
Date: Mon, 11 Apr 2022 02:06:51 -0400
Subject: [PATCH 3/7] Fix Warning - Update rocBLAS re-org PATH (#1509)

---
 src/gemm_v2.cpp               | 2 +-
 src/include/miopen/handle.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp
index fc8b3222c0..f95337dc0f 100644
--- a/src/gemm_v2.cpp
+++ b/src/gemm_v2.cpp
@@ -40,7 +40,7 @@
 
 #if MIOPEN_USE_ROCBLAS
 #include <half.hpp>
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 #include <miopen/perf_field.hpp>
 #endif
 
diff --git a/src/include/miopen/handle.hpp b/src/include/miopen/handle.hpp
index ee45ae2a0b..b99883f5f8 100644
--- a/src/include/miopen/handle.hpp
+++ b/src/include/miopen/handle.hpp
@@ -52,7 +52,7 @@
 
 #if MIOPEN_USE_ROCBLAS
 #include <miopen/manage_ptr.hpp>
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 #endif
 
 namespace miopen {

From 0a9c2fd2f417659fc7df572e90c67d31c78ff057 Mon Sep 17 00:00:00 2001
From: Jun Liu <Liu.Jun@amd.com>
Date: Sun, 10 Apr 2022 23:55:57 -0700
Subject: [PATCH 4/7] Revert "Fix Warning - Update rocBLAS re-org PATH (#1509)"

This reverts commit 9ae2418adf767794e9475274a4cf90e418f00a58.
---
 src/gemm_v2.cpp               | 2 +-
 src/include/miopen/handle.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp
index f95337dc0f..fc8b3222c0 100644
--- a/src/gemm_v2.cpp
+++ b/src/gemm_v2.cpp
@@ -40,7 +40,7 @@
 
 #if MIOPEN_USE_ROCBLAS
 #include <half.hpp>
-#include <rocblas/rocblas.h>
+#include <rocblas.h>
 #include <miopen/perf_field.hpp>
 #endif
 
diff --git a/src/include/miopen/handle.hpp b/src/include/miopen/handle.hpp
index b99883f5f8..ee45ae2a0b 100644
--- a/src/include/miopen/handle.hpp
+++ b/src/include/miopen/handle.hpp
@@ -52,7 +52,7 @@
 
 #if MIOPEN_USE_ROCBLAS
 #include <miopen/manage_ptr.hpp>
-#include <rocblas/rocblas.h>
+#include <rocblas.h>
 #endif
 
 namespace miopen {

From 9aafc24dd4256371f2dc32088f599f66a0c3d4cf Mon Sep 17 00:00:00 2001
From: Zhuoran Yin <zhuoryin@amd.com>
Date: Mon, 11 Apr 2022 13:44:12 -0400
Subject: [PATCH 5/7] [MLIR] Querying all sub-kernels applicability (#1505)

---
 src/solver/conv_mlir_igemm_fwd.cpp        | 10 +++++++++-
 src/solver/conv_mlir_igemm_fwd_xdlops.cpp | 10 +++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/solver/conv_mlir_igemm_fwd.cpp b/src/solver/conv_mlir_igemm_fwd.cpp
index 2c8c75c3f4..f89d430707 100644
--- a/src/solver/conv_mlir_igemm_fwd.cpp
+++ b/src/solver/conv_mlir_igemm_fwd.cpp
@@ -91,7 +91,15 @@ bool PerformanceConvMlirIgemm::IsValid(const ConvolutionContext& ctx) const
     if(*this == MlirHeuristicInitRequest())
         return true;
 
-    return MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, *this, false));
+    int kernel_count = MiirGetKernelCount(mlir::ConstructBuildOptions(ctx, false));
+    bool isValid     = false;
+    for(int kernel_id = 0; kernel_id < kernel_count; ++kernel_id)
+    {
+        isValid = MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, *this, false, kernel_id));
+        if(!isValid)
+            return false;
+    }
+    return isValid;
 #else
     std::ignore = ctx;
     return false;
diff --git a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp
index eb2c8d04d0..3798849f93 100644
--- a/src/solver/conv_mlir_igemm_fwd_xdlops.cpp
+++ b/src/solver/conv_mlir_igemm_fwd_xdlops.cpp
@@ -127,7 +127,15 @@ bool PerformanceConvMlirIgemmXdlops::IsValid(const ConvolutionContext& ctx) cons
     if(*this == MlirHeuristicInitRequest())
         return true;
 
-    return MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, *this, true));
+    int kernel_count = MiirGetKernelCount(mlir::ConstructBuildOptions(ctx, true));
+    bool isValid     = false;
+    for(int kernel_id = 0; kernel_id < kernel_count; ++kernel_id)
+    {
+        isValid = MiirIsConfigApplicable(mlir::ConstructBuildOptions(ctx, *this, true, kernel_id));
+        if(!isValid)
+            return false;
+    }
+    return isValid;
 #else
     std::ignore = ctx;
     return false;

From f6575a69cdcbbf2e767b39c2a876e240625fc84e Mon Sep 17 00:00:00 2001
From: arvindcheru <90783369+arvindcheru@users.noreply.github.com>
Date: Mon, 11 Apr 2022 16:22:35 -0400
Subject: [PATCH 6/7] Updated rocm cmake version requirement  (#1514)

---
 dev-requirements.txt | 2 +-
 requirements.txt     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev-requirements.txt b/dev-requirements.txt
index 160774795f..26e3737100 100755
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,5 +1,5 @@
 ROCmSoftwarePlatform/rocm-recipes
-RadeonOpenCompute/rocm-cmake@4a1514850195360ce772182d34b6b5afd8f37253 --build
+RadeonOpenCompute/rocm-cmake@bd4e360c73fa366f817f5aa013433e799d8cd659 --build
 -f requirements.txt
 # 1.90+
 danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f
diff --git a/requirements.txt b/requirements.txt
index 0747183a26..c5bc5fbd18 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-RadeonOpenCompute/rocm-cmake@4a1514850195360ce772182d34b6b5afd8f37253 --build
+RadeonOpenCompute/rocm-cmake@bd4e360c73fa366f817f5aa013433e799d8cd659 --build
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
 boost@1.72 -DCMAKE_POSITION_INDEPENDENT_CODE=On --build
 half,https://github.com/pfultz2/half/archive/1.12.0.tar.gz -X header -H sha256:0a08660b68abb176ebc2a0cdf8de46e3182a7f46c66443bb80dbfaaec98cf969 --build

From 3d16ede87b9cc504ab5f6d9798150f3ec0197f5e Mon Sep 17 00:00:00 2001
From: Haocong WANG <haocwang@amd.com>
Date: Tue, 12 Apr 2022 04:25:36 +0800
Subject: [PATCH 7/7] [Tensor reorder][Quality][#issue 1476] Split kernel file
 & resolve unsolved issues (#1515)

---
 src/hip/general_tensor_reorder_sol.cpp        |  22 +-
 .../miopen/general_tensor_reorder_sol.hpp     |   1 +
 src/include/miopen/tensor_reorder_util.hpp    |  38 +-
 .../general_tensor_reorder.cpp                | 890 ------------------
 .../general_tensor_reorder_16x256_byte.cpp    |  49 +
 .../general_tensor_reorder_16x256_dword.cpp   |  49 +
 .../general_tensor_reorder_16x256_dwordx2.cpp |  49 +
 .../general_tensor_reorder_16x256_half.cpp    |  49 +
 .../general_tensor_reorder_1x256_byte.cpp     |  49 +
 .../general_tensor_reorder_1x256_dword.cpp    |  49 +
 .../general_tensor_reorder_1x256_dwordx2.cpp  |  49 +
 .../general_tensor_reorder_1x256_half.cpp     |  49 +
 .../general_tensor_reorder_2x256_byte.cpp     |  49 +
 .../general_tensor_reorder_2x256_dword.cpp    |  49 +
 .../general_tensor_reorder_2x256_dwordx2.cpp  |  49 +
 .../general_tensor_reorder_2x256_half.cpp     |  49 +
 .../general_tensor_reorder_4x256_byte.cpp     |  49 +
 .../general_tensor_reorder_4x256_dword.cpp    |  49 +
 .../general_tensor_reorder_4x256_dwordx2.cpp  |  49 +
 .../general_tensor_reorder_4x256_half.cpp     |  49 +
 .../general_tensor_reorder_8x256_byte.cpp     |  49 +
 .../general_tensor_reorder_8x256_dword.cpp    |  49 +
 .../general_tensor_reorder_8x256_dwordx2.cpp  |  49 +
 .../general_tensor_reorder_8x256_half.cpp     |  49 +
 .../general_tensor_reorder_kernel_util.hpp    | 428 +++++++++
 .../order.hpp                                 |  43 -
 test/tensor_reorder.cpp                       |  46 +-
 27 files changed, 1501 insertions(+), 947 deletions(-)
 delete mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_byte.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dword.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dwordx2.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_half.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_byte.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dword.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dwordx2.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_half.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_byte.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dword.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dwordx2.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_half.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_byte.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dword.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dwordx2.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_half.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_byte.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dword.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dwordx2.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_half.cpp
 create mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_kernel_util.hpp
 delete mode 100644 src/kernels/gpu_general_tensor_reorder_kernel/order.hpp

diff --git a/src/hip/general_tensor_reorder_sol.cpp b/src/hip/general_tensor_reorder_sol.cpp
index 2012a574a0..f78aa358f8 100644
--- a/src/hip/general_tensor_reorder_sol.cpp
+++ b/src/hip/general_tensor_reorder_sol.cpp
@@ -52,6 +52,17 @@ static inline std::string GetKernelNameType(std::size_t type_size)
     MIOPEN_THROW("data type not supported");
 }
 
+static inline std::string GetKernelFileName(std::size_t data_size,
+                                            const GeneralReorderParam* kparam)
+{
+    if(kparam == nullptr)
+        MIOPEN_THROW("Memory access fault, kparam is a nullptr");
+    std::ostringstream kernel_file_name;
+    kernel_file_name << "general_tensor_reorder_" << kparam->tile_x << "x" << kparam->tile_y << "_";
+    kernel_file_name << GetKernelNameType(data_size) << ".cpp";
+    return kernel_file_name.str();
+}
+
 static inline std::string GetKernelName(std::size_t data_size,
                                         uint32_t order_0,
                                         uint32_t order_1,
@@ -140,9 +151,10 @@ solver::KernelInfo GenericReorderSolutionImpl::GetKernelInfo() const
                          (block_size * kernel_param_heuristic.tile_x);
     std::size_t grid_size = dim_total;
 
-    std::string kernel_name = GetKernelName();
+    std::string kernel_name      = GetKernelName();
+    std::string kernel_file_name = GetKernelFileName();
     solver::KernelInfo kernel;
-    kernel.kernel_file = "general_tensor_reorder.cpp";
+    kernel.kernel_file = kernel_file_name;
     kernel.kernel_name = kernel_name;
     kernel.g_wk.clear();
     kernel.g_wk.push_back(grid_size * block_size);
@@ -191,6 +203,12 @@ std::vector<OpKernelArg> GenericReorderSolutionImpl::GetKernelArg() const
     return opArgs;
 }
 
+std::string GenericReorderSolutionImpl::GetKernelFileName() const
+{
+    return tensor_reorder::GetKernelFileName(miopen::GetTypeSize(data_type),
+                                             &kernel_param_heuristic);
+}
+
 std::string GenericReorderSolutionImpl::GetKernelName() const
 {
     return tensor_reorder::GetKernelName(miopen::GetTypeSize(data_type),
diff --git a/src/include/miopen/general_tensor_reorder_sol.hpp b/src/include/miopen/general_tensor_reorder_sol.hpp
index 41f387f995..0df1d9cd0c 100644
--- a/src/include/miopen/general_tensor_reorder_sol.hpp
+++ b/src/include/miopen/general_tensor_reorder_sol.hpp
@@ -58,6 +58,7 @@ struct GenericReorderSolutionImpl
     // TODO batched transpose API
     solver::KernelInfo GetKernelInfo() const;
     std::vector<OpKernelArg> GetKernelArg() const;
+    std::string GetKernelFileName() const;
     std::string GetKernelName() const;
     bool IsSkippable() const;
     size_t GetOutputTensorSize() const;
diff --git a/src/include/miopen/tensor_reorder_util.hpp b/src/include/miopen/tensor_reorder_util.hpp
index 1010b98e95..24b7b96650 100644
--- a/src/include/miopen/tensor_reorder_util.hpp
+++ b/src/include/miopen/tensor_reorder_util.hpp
@@ -169,26 +169,49 @@ struct GenericReorderSolution : TensorReorderAttributesBase
 inline std::unique_ptr<TensorReorderAttributesBase>
 MakeTensorReorderAttributes(const ExecutionContext& ctx_,
                             miopenDataType_t data_type_,
-                            uint32_t dim_0_,
-                            uint32_t dim_1_,
-                            uint32_t dim_2_,
-                            uint32_t dim_3_,
+                            uint64_t dim_0_,
+                            uint64_t dim_1_,
+                            uint64_t dim_2_,
+                            uint64_t dim_3_,
                             uint32_t order_0_,
                             uint32_t order_1_,
                             uint32_t order_2_,
                             uint32_t order_3_)
 {
-    std::unique_ptr<TensorReorderAttributesBase> default_ptr;
     if(!ctx_.use_hip_kernels)
     {
-        return default_ptr;
+        return nullptr;
+    }
+    ///\todo 64-bit tensor dimension
+    // Currentlly we have tensor dimension limited to 2^32 - 1
+    if((dim_0_ > UINT32_MAX) || (dim_1_ > UINT32_MAX) || (dim_2_ > UINT32_MAX) ||
+       (dim_3_ > UINT32_MAX))
+    {
+        MIOPEN_THROW("Currentlly we have tensor dimension limitation of 2^32 - 1");
     }
     // Default using general reorder
     if(data_type_ == miopenBFloat16)
     {
         MIOPEN_THROW("Unsupported reorder data type");
     }
+    // Check required reorder is legal or not
+    std::vector<std::vector<uint32_t>> all_possible_order{
+        {0, 1, 3, 2}, {0, 2, 1, 3}, {0, 2, 3, 1}, {0, 3, 1, 2}, {0, 3, 2, 1}, {1, 0, 2, 3},
+        {1, 0, 3, 2}, {1, 2, 0, 3}, {1, 2, 3, 0}, {1, 3, 0, 2}, {1, 3, 2, 0}, {2, 0, 1, 3},
+        {2, 0, 3, 1}, {2, 1, 0, 3}, {2, 1, 3, 0}, {2, 3, 0, 1}, {2, 3, 1, 0}, {3, 0, 1, 2},
+        {3, 0, 2, 1}, {3, 1, 0, 2}, {3, 1, 2, 0}, {3, 2, 0, 1}, {3, 2, 1, 0}};
+    auto found = std::find_if(
+        all_possible_order.begin(),
+        all_possible_order.end(),
+        [order_0_, order_1_, order_2_, order_3_](std::vector<uint32_t> possible_order) {
+            return (possible_order[0] == order_0_) && (possible_order[1] == order_1_) &&
+                   (possible_order[2] == order_2_) && (possible_order[3] == order_3_);
+        });
+    if(found == all_possible_order.end())
+        MIOPEN_THROW("Unsupported Reorder");
+
     int which = 0;
+    // Special cases that utilize batched transpose kernel
     if(data_type_ != miopenDouble)
     {
         if((order_0_ == 0) && (order_1_ == 1) && (order_2_ == 3) && (order_3_ == 2))
@@ -235,9 +258,8 @@ MakeTensorReorderAttributes(const ExecutionContext& ctx_,
     case 5:
         return std::make_unique<BatchedTransposeSolution_3012>(
             ctx_, data_type_, dim_0_, dim_1_, dim_2_, dim_3_);
-    default: MIOPEN_THROW("Unsupported reorder sequence"); break;
+    default: MIOPEN_THROW("Failed to call tensor reorder solver");
     }
-    return default_ptr;
 }
 
 } // namespace miopen
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp
deleted file mode 100644
index e8f236c36e..0000000000
--- a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp
+++ /dev/null
@@ -1,890 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
-#include <hip/hip_runtime.h>
-#endif
-#include <hip/hip_fp16.h>
-#include "order.hpp"
-
-#ifndef TENSOR_REORDER_OCCUPANCY
-#define TENSOR_REORDER_OCCUPANCY 4
-#endif
-
-inline __device__ uint32_t magic_div_u32(const uint32_t& numer,
-                                         const uint32_t& magic,
-                                         const uint32_t& shift)
-{
-    uint32_t tmp = __umulhi(numer, magic);
-    return (tmp + numer) >> shift;
-}
-
-template <typename T, typename dst_order>
-inline __device__ void general_4d_reorder_1x256(T* dst,
-                                                T* src,
-                                                uint32_t dim_0,
-                                                uint32_t dim_1,
-                                                uint32_t dim_2,
-                                                uint32_t dim_3,
-                                                uint32_t dim_stride,
-                                                uint32_t dim_total,
-                                                uint32_t magic_stride0,
-                                                uint32_t shift_stride0,
-                                                uint32_t magic_stride1,
-                                                uint32_t shift_stride1,
-                                                uint32_t magic_stride2,
-                                                uint32_t shift_stride2)
-{
-    constexpr auto dorder = dst_order{};
-    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
-    uint32_t src_index, dst_index;
-    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
-    const uint64_t dst_dim[4] = {
-        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
-    const uint64_t src_stride[4] = {
-        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
-    const uint64_t dst_stride[4] = {
-        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
-
-    uint32_t i_src[4] = {0, 0, 0, 0};
-    uint32_t i_dst[4] = {0, 0, 0, 0};
-
-    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
-    {
-        for(uint32_t k = 0; k < 1; k++)
-        {
-            // unroll k         block          thread
-            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
-            if(src_index < pixel_total)
-            {
-                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
-                i_src[1] = magic_div_u32(
-                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
-                i_src[2] =
-                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
-                                  magic_stride2,
-                                  shift_stride2);
-                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
-                           i_src[2] * src_stride[2];
-
-                i_dst[0] = i_src[dorder.at(0)];
-                i_dst[1] = i_src[dorder.at(1)];
-                i_dst[2] = i_src[dorder.at(2)];
-                i_dst[3] = i_src[dorder.at(3)];
-
-                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
-                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
-                dst[dst_index] = src[src_index];
-            }
-        }
-    }
-}
-
-template <typename T, typename dst_order>
-inline __device__ void general_4d_reorder_2x256(T* dst,
-                                                T* src,
-                                                uint32_t dim_0,
-                                                uint32_t dim_1,
-                                                uint32_t dim_2,
-                                                uint32_t dim_3,
-                                                uint32_t dim_stride,
-                                                uint32_t dim_total,
-                                                uint32_t magic_stride0,
-                                                uint32_t shift_stride0,
-                                                uint32_t magic_stride1,
-                                                uint32_t shift_stride1,
-                                                uint32_t magic_stride2,
-                                                uint32_t shift_stride2)
-{
-    constexpr auto dorder = dst_order{};
-    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
-    uint32_t src_index, dst_index;
-    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
-    const uint64_t dst_dim[4] = {
-        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
-    const uint64_t src_stride[4] = {
-        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
-    const uint64_t dst_stride[4] = {
-        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
-
-    uint32_t i_src[4] = {0, 0, 0, 0};
-    uint32_t i_dst[4] = {0, 0, 0, 0};
-
-    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
-    {
-        for(uint32_t k = 0; k < 2; k++)
-        {
-            // unroll k         block          thread
-            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
-            if(src_index < pixel_total)
-            {
-                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
-                i_src[1] = magic_div_u32(
-                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
-                i_src[2] =
-                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
-                                  magic_stride2,
-                                  shift_stride2);
-                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
-                           i_src[2] * src_stride[2];
-
-                i_dst[0] = i_src[dorder.at(0)];
-                i_dst[1] = i_src[dorder.at(1)];
-                i_dst[2] = i_src[dorder.at(2)];
-                i_dst[3] = i_src[dorder.at(3)];
-
-                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
-                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
-                dst[dst_index] = src[src_index];
-            }
-        }
-    }
-}
-
-template <typename T, typename dst_order>
-inline __device__ void general_4d_reorder_4x256(T* dst,
-                                                T* src,
-                                                uint32_t dim_0,
-                                                uint32_t dim_1,
-                                                uint32_t dim_2,
-                                                uint32_t dim_3,
-                                                uint32_t dim_stride,
-                                                uint32_t dim_total,
-                                                uint32_t magic_stride0,
-                                                uint32_t shift_stride0,
-                                                uint32_t magic_stride1,
-                                                uint32_t shift_stride1,
-                                                uint32_t magic_stride2,
-                                                uint32_t shift_stride2)
-{
-    constexpr auto dorder = dst_order{};
-    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
-    uint32_t src_index, dst_index;
-    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
-    const uint64_t dst_dim[4] = {
-        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
-    const uint64_t src_stride[4] = {
-        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
-    const uint64_t dst_stride[4] = {
-        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
-
-    uint32_t i_src[4] = {0, 0, 0, 0};
-    uint32_t i_dst[4] = {0, 0, 0, 0};
-
-    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
-    {
-        for(uint32_t k = 0; k < 4; k++)
-        {
-            // unroll k         block          thread
-            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
-            if(src_index < pixel_total)
-            {
-                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
-                i_src[1] = magic_div_u32(
-                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
-                i_src[2] =
-                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
-                                  magic_stride2,
-                                  shift_stride2);
-                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
-                           i_src[2] * src_stride[2];
-
-                i_dst[0] = i_src[dorder.at(0)];
-                i_dst[1] = i_src[dorder.at(1)];
-                i_dst[2] = i_src[dorder.at(2)];
-                i_dst[3] = i_src[dorder.at(3)];
-
-                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
-                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
-                dst[dst_index] = src[src_index];
-            }
-        }
-    }
-}
-
-template <typename T, typename dst_order>
-inline __device__ void general_4d_reorder_8x256(T* dst,
-                                                T* src,
-                                                uint32_t dim_0,
-                                                uint32_t dim_1,
-                                                uint32_t dim_2,
-                                                uint32_t dim_3,
-                                                uint32_t dim_stride,
-                                                uint32_t dim_total,
-                                                uint32_t magic_stride0,
-                                                uint32_t shift_stride0,
-                                                uint32_t magic_stride1,
-                                                uint32_t shift_stride1,
-                                                uint32_t magic_stride2,
-                                                uint32_t shift_stride2)
-{
-    constexpr auto dorder = dst_order{};
-    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
-    uint32_t src_index, dst_index;
-    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
-    const uint64_t dst_dim[4] = {
-        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
-    const uint64_t src_stride[4] = {
-        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
-    const uint64_t dst_stride[4] = {
-        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
-
-    uint32_t i_src[4] = {0, 0, 0, 0};
-    uint32_t i_dst[4] = {0, 0, 0, 0};
-
-    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
-    {
-        for(uint32_t k = 0; k < 8; k++)
-        {
-            // unroll k         block          thread
-            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
-            if(src_index < pixel_total)
-            {
-                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
-                i_src[1] = magic_div_u32(
-                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
-                i_src[2] =
-                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
-                                  magic_stride2,
-                                  shift_stride2);
-                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
-                           i_src[2] * src_stride[2];
-
-                i_dst[0] = i_src[dorder.at(0)];
-                i_dst[1] = i_src[dorder.at(1)];
-                i_dst[2] = i_src[dorder.at(2)];
-                i_dst[3] = i_src[dorder.at(3)];
-
-                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
-                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
-                dst[dst_index] = src[src_index];
-            }
-        }
-    }
-}
-
-template <typename T, typename dst_order>
-inline __device__ void general_4d_reorder_16x256(T* dst,
-                                                 T* src,
-                                                 uint32_t dim_0,
-                                                 uint32_t dim_1,
-                                                 uint32_t dim_2,
-                                                 uint32_t dim_3,
-                                                 uint32_t dim_stride,
-                                                 uint32_t dim_total,
-                                                 uint32_t magic_stride0,
-                                                 uint32_t shift_stride0,
-                                                 uint32_t magic_stride1,
-                                                 uint32_t shift_stride1,
-                                                 uint32_t magic_stride2,
-                                                 uint32_t shift_stride2)
-{
-    constexpr auto dorder = dst_order{};
-    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
-    uint32_t src_index, dst_index;
-    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
-    const uint64_t dst_dim[4] = {
-        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
-    const uint64_t src_stride[4] = {
-        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
-    const uint64_t dst_stride[4] = {
-        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
-
-    uint32_t i_src[4] = {0, 0, 0, 0};
-    uint32_t i_dst[4] = {0, 0, 0, 0};
-
-    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
-    {
-        for(uint32_t k = 0; k < 16; k++)
-        {
-            // unroll k         block          thread
-            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
-            if(src_index < pixel_total)
-            {
-                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
-                i_src[1] = magic_div_u32(
-                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
-                i_src[2] =
-                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
-                                  magic_stride2,
-                                  shift_stride2);
-                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
-                           i_src[2] * src_stride[2];
-
-                i_dst[0] = i_src[dorder.at(0)];
-                i_dst[1] = i_src[dorder.at(1)];
-                i_dst[2] = i_src[dorder.at(2)];
-                i_dst[3] = i_src[dorder.at(3)];
-
-                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
-                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
-                dst[dst_index] = src[src_index];
-            }
-        }
-    }
-}
-
-#define DEFINE_GENERAL_4D_REORDER_KERNEL(tile_trait,                                               \
-                                         dst_order,                                                \
-                                         accept_data_type,                                         \
-                                         cast_data_type,                                           \
-                                         lb_threads_per_block,                                     \
-                                         lb_blocks_per_cu)                                         \
-    extern "C" __global__ void __launch_bounds__(lb_threads_per_block, lb_blocks_per_cu)           \
-        general_4d_reorder_##tile_trait##_##accept_data_type##_##dst_order(void* dst,              \
-                                                                           void* src,              \
-                                                                           uint32_t dim_0,         \
-                                                                           uint32_t dim_1,         \
-                                                                           uint32_t dim_2,         \
-                                                                           uint32_t dim_3,         \
-                                                                           uint32_t dim_stride,    \
-                                                                           uint32_t dim_total,     \
-                                                                           uint32_t magic_stride0, \
-                                                                           uint32_t shift_stride0, \
-                                                                           uint32_t magic_stride1, \
-                                                                           uint32_t shift_stride1, \
-                                                                           uint32_t magic_stride2, \
-                                                                           uint32_t shift_stride2) \
-    {                                                                                              \
-        general_4d_reorder_##tile_trait<cast_data_type, dst_order>(                                \
-            reinterpret_cast<cast_data_type*>(dst),                                                \
-            reinterpret_cast<cast_data_type*>(src),                                                \
-            dim_0,                                                                                 \
-            dim_1,                                                                                 \
-            dim_2,                                                                                 \
-            dim_3,                                                                                 \
-            dim_stride,                                                                            \
-            dim_total,                                                                             \
-            magic_stride0,                                                                         \
-            shift_stride0,                                                                         \
-            magic_stride1,                                                                         \
-            shift_stride1,                                                                         \
-            magic_stride2,                                                                         \
-            shift_stride2);                                                                        \
-    }
-// default order is 0 1 2 3
-using r0132 = order<0, 1, 3, 2>;
-using r0213 = order<0, 2, 1, 3>; // nhwc2nchwc
-using r0231 = order<0, 2, 3, 1>; // nchw2nchwc
-using r0312 = order<0, 3, 1, 2>; // nhwc2nchw
-using r0321 = order<0, 3, 2, 1>;
-using r1023 = order<1, 0, 2, 3>;
-using r1032 = order<1, 0, 3, 2>;
-using r1203 = order<1, 2, 0, 3>;
-using r1230 = order<1, 2, 3, 0>;
-using r1302 = order<1, 3, 0, 2>; // nchw2chwnc
-using r1320 = order<1, 3, 2, 0>;
-using r2013 = order<2, 0, 1, 3>;
-using r2031 = order<2, 0, 3, 1>;
-using r2103 = order<2, 1, 0, 3>; // nhwc2chwnc
-using r2130 = order<2, 1, 3, 0>;
-using r2301 = order<2, 3, 0, 1>;
-using r2310 = order<2, 3, 1, 0>;
-using r3012 = order<3, 0, 1, 2>;
-using r3021 = order<3, 0, 2, 1>;
-using r3102 = order<3, 1, 0, 2>;
-using r3120 = order<3, 1, 2, 0>;
-using r3201 = order<3, 2, 0, 1>;
-using r3210 = order<3, 2, 1, 0>;
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
-DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_byte.cpp
new file mode 100644
index 0000000000..dc39823ab8
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_byte.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dword.cpp
new file mode 100644
index 0000000000..fd868304ae
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dword.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dwordx2.cpp
new file mode 100644
index 0000000000..fc909017b4
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_dwordx2.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_half.cpp
new file mode 100644
index 0000000000..28c8d3f166
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_16x256_half.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(16x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_byte.cpp
new file mode 100644
index 0000000000..0c2c5d1109
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_byte.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dword.cpp
new file mode 100644
index 0000000000..8d90e2ff07
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dword.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dwordx2.cpp
new file mode 100644
index 0000000000..fa5ec29887
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_dwordx2.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_half.cpp
new file mode 100644
index 0000000000..f7cc9315a0
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_1x256_half.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(1x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_byte.cpp
new file mode 100644
index 0000000000..2da1b46c75
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_byte.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dword.cpp
new file mode 100644
index 0000000000..b902bc4403
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dword.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dwordx2.cpp
new file mode 100644
index 0000000000..58a25a2c00
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_dwordx2.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_half.cpp
new file mode 100644
index 0000000000..dcfd36342f
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_2x256_half.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(2x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_byte.cpp
new file mode 100644
index 0000000000..0cc96944a9
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_byte.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dword.cpp
new file mode 100644
index 0000000000..a6f92c4a0f
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dword.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dwordx2.cpp
new file mode 100644
index 0000000000..42ffd27a72
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_dwordx2.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_half.cpp
new file mode 100644
index 0000000000..aebe744280
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_4x256_half.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(4x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_byte.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_byte.cpp
new file mode 100644
index 0000000000..51b68958f5
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_byte.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, byte, uchar, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dword.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dword.cpp
new file mode 100644
index 0000000000..f8c7d50084
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dword.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, dword, float, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dwordx2.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dwordx2.cpp
new file mode 100644
index 0000000000..682dd19b6f
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_dwordx2.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, dwordx2, double, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_half.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_half.cpp
new file mode 100644
index 0000000000..c25649db3d
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_8x256_half.cpp
@@ -0,0 +1,49 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "general_tensor_reorder_kernel_util.hpp"
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0132, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0213, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0231, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0312, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r0321, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1023, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1032, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1203, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1230, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1302, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r1320, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2013, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2031, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2103, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2130, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2301, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r2310, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3012, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3021, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3102, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3120, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3201, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
+DEFINE_GENERAL_4D_REORDER_KERNEL(8x256, r3210, half, ushort, 256, TENSOR_REORDER_OCCUPANCY)
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_kernel_util.hpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_kernel_util.hpp
new file mode 100644
index 0000000000..3568627990
--- /dev/null
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder_kernel_util.hpp
@@ -0,0 +1,428 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GENERAL_TENSOR_REORDER_UTIL_HPP
+#include <cstdint>
+
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#endif
+
+#ifndef TENSOR_REORDER_OCCUPANCY
+#define TENSOR_REORDER_OCCUPANCY 4
+#endif
+#define GENERAL_TENSOR_REORDER_UTIL_HPP
+
+template <int... Is>
+struct order
+{
+    static constexpr std::size_t m_size = sizeof...(Is);
+    // the last dummy element is to prevent compiler complain about empty array, when mSize = 0
+    static constexpr int m_data[m_size + 1] = {Is..., 0};
+
+    __host__ __device__ static constexpr uint64_t size() { return m_size; }
+
+    __host__ __device__ static constexpr uint64_t get_size() { return size(); }
+
+    __host__ __device__ static constexpr int at(int I) { return m_data[I]; }
+};
+
+inline __device__ uint32_t magic_div_u32(const uint32_t& numer,
+                                         const uint32_t& magic,
+                                         const uint32_t& shift)
+{
+    uint32_t tmp = __umulhi(numer, magic);
+    return (tmp + numer) >> shift;
+}
+
+template <typename T, typename dst_order>
+inline __device__ void general_4d_reorder_1x256(T* dst,
+                                                T* src,
+                                                uint32_t dim_0,
+                                                uint32_t dim_1,
+                                                uint32_t dim_2,
+                                                uint32_t dim_3,
+                                                uint32_t dim_stride,
+                                                uint32_t dim_total,
+                                                uint32_t magic_stride0,
+                                                uint32_t shift_stride0,
+                                                uint32_t magic_stride1,
+                                                uint32_t shift_stride1,
+                                                uint32_t magic_stride2,
+                                                uint32_t shift_stride2)
+{
+    constexpr auto dorder = dst_order{};
+    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
+    uint32_t src_index, dst_index;
+    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
+    const uint64_t dst_dim[4] = {
+        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
+    const uint64_t src_stride[4] = {
+        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
+    const uint64_t dst_stride[4] = {
+        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
+
+    uint32_t i_src[4] = {0, 0, 0, 0};
+    uint32_t i_dst[4] = {0, 0, 0, 0};
+
+    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
+    {
+        for(uint32_t k = 0; k < 1; k++)
+        {
+            // unroll k         block          thread
+            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
+            if(src_index < pixel_total)
+            {
+                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
+                i_src[1] = magic_div_u32(
+                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
+                i_src[2] =
+                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
+                                  magic_stride2,
+                                  shift_stride2);
+                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
+                           i_src[2] * src_stride[2];
+
+                i_dst[0] = i_src[dorder.at(0)];
+                i_dst[1] = i_src[dorder.at(1)];
+                i_dst[2] = i_src[dorder.at(2)];
+                i_dst[3] = i_src[dorder.at(3)];
+
+                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
+                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
+                dst[dst_index] = src[src_index];
+            }
+        }
+    }
+}
+
+template <typename T, typename dst_order>
+inline __device__ void general_4d_reorder_2x256(T* dst,
+                                                T* src,
+                                                uint32_t dim_0,
+                                                uint32_t dim_1,
+                                                uint32_t dim_2,
+                                                uint32_t dim_3,
+                                                uint32_t dim_stride,
+                                                uint32_t dim_total,
+                                                uint32_t magic_stride0,
+                                                uint32_t shift_stride0,
+                                                uint32_t magic_stride1,
+                                                uint32_t shift_stride1,
+                                                uint32_t magic_stride2,
+                                                uint32_t shift_stride2)
+{
+    constexpr auto dorder = dst_order{};
+    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
+    uint32_t src_index, dst_index;
+    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
+    const uint64_t dst_dim[4] = {
+        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
+    const uint64_t src_stride[4] = {
+        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
+    const uint64_t dst_stride[4] = {
+        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
+
+    uint32_t i_src[4] = {0, 0, 0, 0};
+    uint32_t i_dst[4] = {0, 0, 0, 0};
+
+    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
+    {
+        for(uint32_t k = 0; k < 2; k++)
+        {
+            // unroll k         block          thread
+            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
+            if(src_index < pixel_total)
+            {
+                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
+                i_src[1] = magic_div_u32(
+                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
+                i_src[2] =
+                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
+                                  magic_stride2,
+                                  shift_stride2);
+                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
+                           i_src[2] * src_stride[2];
+
+                i_dst[0] = i_src[dorder.at(0)];
+                i_dst[1] = i_src[dorder.at(1)];
+                i_dst[2] = i_src[dorder.at(2)];
+                i_dst[3] = i_src[dorder.at(3)];
+
+                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
+                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
+                dst[dst_index] = src[src_index];
+            }
+        }
+    }
+}
+
+template <typename T, typename dst_order>
+inline __device__ void general_4d_reorder_4x256(T* dst,
+                                                T* src,
+                                                uint32_t dim_0,
+                                                uint32_t dim_1,
+                                                uint32_t dim_2,
+                                                uint32_t dim_3,
+                                                uint32_t dim_stride,
+                                                uint32_t dim_total,
+                                                uint32_t magic_stride0,
+                                                uint32_t shift_stride0,
+                                                uint32_t magic_stride1,
+                                                uint32_t shift_stride1,
+                                                uint32_t magic_stride2,
+                                                uint32_t shift_stride2)
+{
+    constexpr auto dorder = dst_order{};
+    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
+    uint32_t src_index, dst_index;
+    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
+    const uint64_t dst_dim[4] = {
+        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
+    const uint64_t src_stride[4] = {
+        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
+    const uint64_t dst_stride[4] = {
+        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
+
+    uint32_t i_src[4] = {0, 0, 0, 0};
+    uint32_t i_dst[4] = {0, 0, 0, 0};
+
+    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
+    {
+        for(uint32_t k = 0; k < 4; k++)
+        {
+            // unroll k         block          thread
+            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
+            if(src_index < pixel_total)
+            {
+                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
+                i_src[1] = magic_div_u32(
+                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
+                i_src[2] =
+                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
+                                  magic_stride2,
+                                  shift_stride2);
+                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
+                           i_src[2] * src_stride[2];
+
+                i_dst[0] = i_src[dorder.at(0)];
+                i_dst[1] = i_src[dorder.at(1)];
+                i_dst[2] = i_src[dorder.at(2)];
+                i_dst[3] = i_src[dorder.at(3)];
+
+                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
+                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
+                dst[dst_index] = src[src_index];
+            }
+        }
+    }
+}
+
+template <typename T, typename dst_order>
+inline __device__ void general_4d_reorder_8x256(T* dst,
+                                                T* src,
+                                                uint32_t dim_0,
+                                                uint32_t dim_1,
+                                                uint32_t dim_2,
+                                                uint32_t dim_3,
+                                                uint32_t dim_stride,
+                                                uint32_t dim_total,
+                                                uint32_t magic_stride0,
+                                                uint32_t shift_stride0,
+                                                uint32_t magic_stride1,
+                                                uint32_t shift_stride1,
+                                                uint32_t magic_stride2,
+                                                uint32_t shift_stride2)
+{
+    constexpr auto dorder = dst_order{};
+    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
+    uint32_t src_index, dst_index;
+    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
+    const uint64_t dst_dim[4] = {
+        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
+    const uint64_t src_stride[4] = {
+        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
+    const uint64_t dst_stride[4] = {
+        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
+
+    uint32_t i_src[4] = {0, 0, 0, 0};
+    uint32_t i_dst[4] = {0, 0, 0, 0};
+
+    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
+    {
+        for(uint32_t k = 0; k < 8; k++)
+        {
+            // unroll k         block          thread
+            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
+            if(src_index < pixel_total)
+            {
+                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
+                i_src[1] = magic_div_u32(
+                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
+                i_src[2] =
+                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
+                                  magic_stride2,
+                                  shift_stride2);
+                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
+                           i_src[2] * src_stride[2];
+
+                i_dst[0] = i_src[dorder.at(0)];
+                i_dst[1] = i_src[dorder.at(1)];
+                i_dst[2] = i_src[dorder.at(2)];
+                i_dst[3] = i_src[dorder.at(3)];
+
+                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
+                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
+                dst[dst_index] = src[src_index];
+            }
+        }
+    }
+}
+
+template <typename T, typename dst_order>
+inline __device__ void general_4d_reorder_16x256(T* dst,
+                                                 T* src,
+                                                 uint32_t dim_0,
+                                                 uint32_t dim_1,
+                                                 uint32_t dim_2,
+                                                 uint32_t dim_3,
+                                                 uint32_t dim_stride,
+                                                 uint32_t dim_total,
+                                                 uint32_t magic_stride0,
+                                                 uint32_t shift_stride0,
+                                                 uint32_t magic_stride1,
+                                                 uint32_t shift_stride1,
+                                                 uint32_t magic_stride2,
+                                                 uint32_t shift_stride2)
+{
+    constexpr auto dorder = dst_order{};
+    uint32_t pixel_total  = dim_0 * dim_1 * dim_2 * dim_3;
+    uint32_t src_index, dst_index;
+    const uint64_t src_dim[4] = {dim_0, dim_1, dim_2, dim_3};
+    const uint64_t dst_dim[4] = {
+        src_dim[dorder.at(0)], src_dim[dorder.at(1)], src_dim[dorder.at(2)], src_dim[dorder.at(3)]};
+    const uint64_t src_stride[4] = {
+        src_dim[1] * src_dim[2] * src_dim[3], src_dim[2] * src_dim[3], src_dim[3], 1};
+    const uint64_t dst_stride[4] = {
+        dst_dim[1] * dst_dim[2] * dst_dim[3], dst_dim[2] * dst_dim[3], dst_dim[3], 1};
+
+    uint32_t i_src[4] = {0, 0, 0, 0};
+    uint32_t i_dst[4] = {0, 0, 0, 0};
+
+    for(uint32_t dim_id = blockIdx.x; dim_id < dim_total; dim_id += dim_stride)
+    {
+        for(uint32_t k = 0; k < 16; k++)
+        {
+            // unroll k         block          thread
+            src_index = k * dim_total * 256 + dim_id * 256 + threadIdx.x;
+            if(src_index < pixel_total)
+            {
+                i_src[0] = magic_div_u32(src_index, magic_stride0, shift_stride0);
+                i_src[1] = magic_div_u32(
+                    src_index - i_src[0] * src_stride[0], magic_stride1, shift_stride1);
+                i_src[2] =
+                    magic_div_u32(src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1],
+                                  magic_stride2,
+                                  shift_stride2);
+                i_src[3] = src_index - i_src[0] * src_stride[0] - i_src[1] * src_stride[1] -
+                           i_src[2] * src_stride[2];
+
+                i_dst[0] = i_src[dorder.at(0)];
+                i_dst[1] = i_src[dorder.at(1)];
+                i_dst[2] = i_src[dorder.at(2)];
+                i_dst[3] = i_src[dorder.at(3)];
+
+                dst_index = i_dst[0] * dst_stride[0] + i_dst[1] * dst_stride[1] +
+                            i_dst[2] * dst_stride[2] + i_dst[3] * dst_stride[3];
+                dst[dst_index] = src[src_index];
+            }
+        }
+    }
+}
+
+#define DEFINE_GENERAL_4D_REORDER_KERNEL(tile_trait,                                               \
+                                         dst_order,                                                \
+                                         accept_data_type,                                         \
+                                         cast_data_type,                                           \
+                                         lb_threads_per_block,                                     \
+                                         lb_blocks_per_cu)                                         \
+    extern "C" __global__ void __launch_bounds__(lb_threads_per_block, lb_blocks_per_cu)           \
+        general_4d_reorder_##tile_trait##_##accept_data_type##_##dst_order(void* dst,              \
+                                                                           void* src,              \
+                                                                           uint32_t dim_0,         \
+                                                                           uint32_t dim_1,         \
+                                                                           uint32_t dim_2,         \
+                                                                           uint32_t dim_3,         \
+                                                                           uint32_t dim_stride,    \
+                                                                           uint32_t dim_total,     \
+                                                                           uint32_t magic_stride0, \
+                                                                           uint32_t shift_stride0, \
+                                                                           uint32_t magic_stride1, \
+                                                                           uint32_t shift_stride1, \
+                                                                           uint32_t magic_stride2, \
+                                                                           uint32_t shift_stride2) \
+    {                                                                                              \
+        general_4d_reorder_##tile_trait<cast_data_type, dst_order>(                                \
+            reinterpret_cast<cast_data_type*>(dst),                                                \
+            reinterpret_cast<cast_data_type*>(src),                                                \
+            dim_0,                                                                                 \
+            dim_1,                                                                                 \
+            dim_2,                                                                                 \
+            dim_3,                                                                                 \
+            dim_stride,                                                                            \
+            dim_total,                                                                             \
+            magic_stride0,                                                                         \
+            shift_stride0,                                                                         \
+            magic_stride1,                                                                         \
+            shift_stride1,                                                                         \
+            magic_stride2,                                                                         \
+            shift_stride2);                                                                        \
+    }
+// default order is 0 1 2 3
+using r0132 = order<0, 1, 3, 2>;
+using r0213 = order<0, 2, 1, 3>; // nhwc2nchwc
+using r0231 = order<0, 2, 3, 1>; // nchw2nchwc
+using r0312 = order<0, 3, 1, 2>; // nhwc2nchw
+using r0321 = order<0, 3, 2, 1>;
+using r1023 = order<1, 0, 2, 3>;
+using r1032 = order<1, 0, 3, 2>;
+using r1203 = order<1, 2, 0, 3>;
+using r1230 = order<1, 2, 3, 0>;
+using r1302 = order<1, 3, 0, 2>; // nchw2chwnc
+using r1320 = order<1, 3, 2, 0>;
+using r2013 = order<2, 0, 1, 3>;
+using r2031 = order<2, 0, 3, 1>;
+using r2103 = order<2, 1, 0, 3>; // nhwc2chwnc
+using r2130 = order<2, 1, 3, 0>;
+using r2301 = order<2, 3, 0, 1>;
+using r2310 = order<2, 3, 1, 0>;
+using r3012 = order<3, 0, 1, 2>;
+using r3021 = order<3, 0, 2, 1>;
+using r3102 = order<3, 1, 0, 2>;
+using r3120 = order<3, 1, 2, 0>;
+using r3201 = order<3, 2, 0, 1>;
+using r3210 = order<3, 2, 1, 0>;
+#endif
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp b/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp
deleted file mode 100644
index c8e80f7e7f..0000000000
--- a/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp
+++ /dev/null
@@ -1,43 +0,0 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2020-2022 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#include <cstdint>
-#ifndef ORDER_HPP
-#define ORDER_HPP
-
-template <int... Is>
-struct order
-{
-    static constexpr std::size_t m_size = sizeof...(Is);
-    // the last dummy element is to prevent compiler complain about empty array, when mSize = 0
-    static constexpr int m_data[m_size + 1] = {Is..., 0};
-
-    __host__ __device__ static constexpr uint64_t size() { return m_size; }
-
-    __host__ __device__ static constexpr uint64_t get_size() { return size(); }
-
-    __host__ __device__ static constexpr int at(int I) { return m_data[I]; }
-};
-#endif
diff --git a/test/tensor_reorder.cpp b/test/tensor_reorder.cpp
index d1e2e1d1ce..452dd1b570 100644
--- a/test/tensor_reorder.cpp
+++ b/test/tensor_reorder.cpp
@@ -314,9 +314,9 @@ struct tensor_reorder_base_driver : test_driver
     }
     ~tensor_reorder_base_driver() { miopenDestroy(handle); }
 
-    static std::vector<uint32_t> get_dim_3_size() { return {1, 9, 14}; }
-    static std::vector<uint32_t> get_dim_2_size() { return {1, 9, 14}; }
-    static std::vector<uint32_t> get_dim_1_size() { return {3, 8, 14}; }
+    static std::vector<uint32_t> get_dim_3_size() { return {1, 9}; }
+    static std::vector<uint32_t> get_dim_2_size() { return {1, 9}; }
+    static std::vector<uint32_t> get_dim_1_size() { return {3, 8}; }
     static std::vector<uint32_t> get_dim_0_size() { return {1, 2}; }
 
     template <typename F>
@@ -504,4 +504,42 @@ struct tensor_reorder_driver : tensor_reorder_base_driver
     }
 };
 
-int main(int argc, const char* argv[]) { test_drive<tensor_reorder_driver>(argc, argv); }
+template <template <class...> class Driver>
+void test_tensor_reorder(int argc, const char* argv[])
+{
+    std::vector<std::string> as(argv + 1, argv + argc);
+    as.emplace_back("--float");
+    for(auto&& arg : as)
+    {
+        if(arg == "--all")
+        {
+            test_drive_impl<Driver<double>>(argv[0], as);
+            test_drive_impl<Driver<float>>(argv[0], as);
+            test_drive_impl<Driver<half_float::half>>(argv[0], as);
+            test_drive_impl<Driver<int8_t>>(argv[0], std::move(as));
+            break;
+        }
+        if(arg == "--double")
+        {
+            test_drive_impl<Driver<double>>(argv[0], std::move(as));
+            break;
+        }
+        if(arg == "--float")
+        {
+            test_drive_impl<Driver<float>>(argv[0], std::move(as));
+            break;
+        }
+        if(arg == "--half")
+        {
+            test_drive_impl<Driver<half_float::half>>(argv[0], std::move(as));
+            break;
+        }
+        if(arg == "--int8")
+        {
+            test_drive_impl<Driver<int8_t>>(argv[0], std::move(as));
+            break;
+        }
+    }
+}
+
+int main(int argc, const char* argv[]) { test_tensor_reorder<tensor_reorder_driver>(argc, argv); }