
[Tensor reorder][Quality][#issue 1476] Improve naming style and CTest design #1481

Merged 88 commits on Apr 7, 2022
Changes from 86 commits
c52547a
test_file commit
aska-0096 Jan 24, 2022
60d4564
add all files
aska-0096 Jan 26, 2022
569044f
fix some bugs and try
aska-0096 Jan 27, 2022
682a725
fix bug
aska-0096 Jan 27, 2022
7fc0de7
fix bug
aska-0096 Jan 27, 2022
b1f5c89
fix bugs
aska-0096 Jan 27, 2022
9573861
fix bugs
aska-0096 Jan 27, 2022
ca1bb57
fix bug
aska-0096 Jan 27, 2022
b0c188c
fix bugs
aska-0096 Jan 27, 2022
57dab09
fix bug
aska-0096 Jan 27, 2022
45894a7
fixbug
aska-0096 Jan 27, 2022
84863c4
fixbug
aska-0096 Jan 27, 2022
54d1f2e
test 1
aska-0096 Jan 27, 2022
e5f8617
General test, (Batched passed)
aska-0096 Jan 27, 2022
4dba45c
0321 test
aska-0096 Jan 27, 2022
c3c5303
explicit template instance
aska-0096 Jan 27, 2022
b539c9c
fix bug
aska-0096 Jan 27, 2022
b9e8684
fix bug
aska-0096 Jan 27, 2022
c766a69
move instantiation into sol.hpp
aska-0096 Jan 27, 2022
37b1926
fix bug
aska-0096 Jan 27, 2022
a36ce98
fixbug
aska-0096 Jan 27, 2022
923e4b3
fix bug
aska-0096 Jan 27, 2022
7802205
fix bug
aska-0096 Jan 27, 2022
541a1e7
fixbug
aska-0096 Jan 27, 2022
45f1a6f
fix bug
aska-0096 Jan 27, 2022
3cc7c61
fixbug
aska-0096 Jan 27, 2022
08a9c82
fixbug
aska-0096 Jan 27, 2022
3374fa6
fixbug
aska-0096 Jan 27, 2022
0dfac32
fixbug
aska-0096 Jan 27, 2022
e9ac702
batched test
aska-0096 Jan 27, 2022
879694f
test batch
aska-0096 Jan 27, 2022
183e728
test
aska-0096 Jan 27, 2022
978f8e9
add kernel
aska-0096 Jan 27, 2022
ff5e47e
fixbugs
aska-0096 Jan 27, 2022
7845771
fixtypo
aska-0096 Jan 27, 2022
16dfe07
fixtypo
aska-0096 Jan 27, 2022
9ef53c0
addkerneltest
aska-0096 Jan 27, 2022
aa6a09d
try separated solution
aska-0096 Jan 27, 2022
108b80c
fixbug
aska-0096 Jan 27, 2022
47d4b3d
fix bug
aska-0096 Jan 27, 2022
74a7545
eliminate some warnings
aska-0096 Jan 27, 2022
86c21af
fix some warnings
aska-0096 Jan 27, 2022
096c661
fix some warnings
aska-0096 Jan 27, 2022
32f21b0
fork should not call CI
aska-0096 Jan 28, 2022
eff9686
push & pull test on forked repo
aska-0096 Jan 28, 2022
c81cb86
try
aska-0096 Jan 28, 2022
6b9f145
try
aska-0096 Jan 28, 2022
ff7a441
fix typo
aska-0096 Jan 28, 2022
5d223eb
debug
aska-0096 Jan 28, 2022
acd2877
add debug points
aska-0096 Jan 28, 2022
3453bba
add checkpoints
aska-0096 Jan 28, 2022
0651536
add check point
aska-0096 Jan 28, 2022
a08f7ce
fixbugs
aska-0096 Jan 28, 2022
3196935
fixbug try
aska-0096 Jan 28, 2022
e1244fd
debug
aska-0096 Jan 29, 2022
8652207
cmake debug
aska-0096 Jan 29, 2022
6e98bfb
Before warning fixed
aska-0096 Feb 8, 2022
ad66328
Merge pull request #1411 from aska-0096/tensor_reorder
aska-0096 Feb 8, 2022
d97fc28
test all cases
aska-0096 Feb 8, 2022
1fe9254
Merge pull request #1417 from aska-0096/tensor_reorder
aska-0096 Feb 10, 2022
a3aab19
local analyze passed
aska-0096 Feb 10, 2022
35aa269
fix typo
aska-0096 Feb 10, 2022
1072ac1
Merge pull request #1418 from aska-0096/tensor_reorder
aska-0096 Feb 10, 2022
2870b32
fix typo
aska-0096 Feb 10, 2022
b6aa19b
Merge branch 'ROCmSoftwarePlatform:tensor_reorder' into tensor_reorder
aska-0096 Feb 10, 2022
1080341
Merge pull request #1420 from aska-0096/tensor_reorder
aska-0096 Feb 10, 2022
24d8916
fix bug in order.hpp
aska-0096 Feb 10, 2022
0997915
Merge pull request #1421 from aska-0096/tensor_reorder
aska-0096 Feb 10, 2022
8d6f995
fix bug in order.hpp to satisfy cxx11
aska-0096 Feb 10, 2022
41f2f35
Merge branch 'tensor_reorder' of https://github.com/aska-0096/MIOpen …
aska-0096 Feb 10, 2022
03e5c48
Merge pull request #1422 from aska-0096/tensor_reorder
aska-0096 Feb 11, 2022
04f48d6
fix format: add a new line
aska-0096 Feb 11, 2022
e42f13f
[skip ci] Update: add double data type support.
aska-0096 Feb 14, 2022
d0198e2
Merge branch 'tensor_reorder' of https://github.com/ROCmSoftwarePlatf…
aska-0096 Feb 14, 2022
a5099b0
Update: add explanation comments on specific order.
aska-0096 Feb 14, 2022
2c44205
Respond to review suggestions
aska-0096 Mar 25, 2022
c223c94
Merge remote-tracking branch 'origin/develop' into tensor_reorder
aska-0096 Mar 25, 2022
209cd1c
Delete .gitignore
aska-0096 Mar 26, 2022
4dd545e
Delete settings.json
aska-0096 Mar 26, 2022
ffb5a10
clang-format check
aska-0096 Mar 26, 2022
d124707
post review
aska-0096 Mar 26, 2022
a2c3202
update on ctest
aska-0096 Mar 27, 2022
edfbbba
resolve M/D conflict
aska-0096 Mar 27, 2022
21c3ec5
Merge branch 'tensor_reorder' of https://github.com/ROCmSoftwarePlatf…
aska-0096 Mar 27, 2022
5fb50b7
re-clang format check
aska-0096 Mar 27, 2022
5cc064c
fix opencl tidy
aska-0096 Mar 28, 2022
f5411ca
bug fix
aska-0096 Mar 29, 2022
3210867
header file fix
aska-0096 Mar 30, 2022
12 changes: 6 additions & 6 deletions src/conv/invokers/impl_gemm_dynamic.cpp
@@ -569,9 +569,9 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(
if(!trans_output_skippable)
opArgsTrans.emplace_back(trans_output.GetKernelArg());

trans_input_size = trans_input_skippable ? 0 : trans_input.GetSize();
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();
trans_input_size = trans_input_skippable ? 0 : trans_input.GetOutputTensorSize();
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetOutputTensorSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetOutputTensorSize();

int idx = 0;
if(!trans_input_skippable)
@@ -887,9 +887,9 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(
if(!trans_output_skippable)
opArgsTrans.emplace_back(trans_output.GetKernelArg());

trans_input_size = trans_input_skippable ? 0 : trans_input.GetSize();
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();
trans_input_size = trans_input_skippable ? 0 : trans_input.GetOutputTensorSize();
trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetOutputTensorSize();
trans_output_size = trans_output_skippable ? 0 : trans_output.GetOutputTensorSize();

int idx = 0;
if(!trans_input_skippable)
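The hunks above replace `GetSize()` with the more descriptive `GetOutputTensorSize()` at every call site that computes workspace requirements. A minimal sketch of the pattern, using a hypothetical `TransposeStub` stand-in (the real type is `BatchedTransposeSolution`): a skippable transpose step contributes zero workspace, otherwise the renamed accessor reports the bytes its output tensor needs.

```cpp
#include <cstddef>

// Hypothetical stand-in for BatchedTransposeSolution, reduced to the one
// accessor this hunk exercises.
struct TransposeStub
{
    std::size_t bytes;
    // Renamed from GetSize() in the PR to state what the size refers to.
    std::size_t GetOutputTensorSize() const { return bytes; }
};

// Mirrors the ternary pattern in the diff: skippable steps need no workspace.
inline std::size_t WorkspaceBytes(bool skippable, const TransposeStub& t)
{
    return skippable ? 0 : t.GetOutputTensorSize();
}
```

The rename carries no behavior change; it only disambiguates "size of what" at call sites that sum several workspace contributions.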
8 changes: 5 additions & 3 deletions src/hip/batched_transpose_sol.cpp
@@ -304,7 +304,7 @@ BatchedTransposeSolution::BatchedTransposeSolution(const ExecutionContext& ctx,
kernel_param_heuristic = batched_transpose::HeuristicGet(data_size, batch, height, width);
}

solver::KernelInfo BatchedTransposeSolution::GetKernel() const
solver::KernelInfo BatchedTransposeSolution::GetKernelInfo() const
{
std::size_t block_size = BATCHED_TRANSPOSE_BLOCK_SIZE;
#if BATCHED_TRANSPOSE_PERSISTENT
@@ -327,7 +327,7 @@ solver::KernelInfo BatchedTransposeSolution::GetKernel() const
kernel.l_wk.push_back(1);
kernel.l_wk.push_back(1);

MIOPEN_LOG_I2("BatchedTransposeSolution use kernel: " + kernel_name);
MIOPEN_LOG_T(kernel_name);

return kernel;
}
@@ -351,6 +351,8 @@ std::vector<OpKernelArg> BatchedTransposeSolution::GetKernelArg() const
opArgs.emplace_back(0); // placeholder
opArgs.emplace_back(height);
opArgs.emplace_back(width);
if(grid_size != static_cast<uint32_t>(grid_size))
MIOPEN_THROW("Variable grid size can't be cast to uint32_t safely");
opArgs.emplace_back(static_cast<uint32_t>(grid_size));
opArgs.emplace_back(dim_total);
opArgs.emplace_back(magic_h.magic);
@@ -374,7 +376,7 @@ bool BatchedTransposeSolution::IsSkippable() const
return height == 1 || width == 1;
}

size_t BatchedTransposeSolution::GetSize() const
size_t BatchedTransposeSolution::GetOutputTensorSize() const
{
return miopen::GetTypeSize(data_type) * batch * height * width;
}
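The new guard in `GetKernelArg()` protects the narrowing of a `size_t` grid size to the `uint32_t` kernel argument. A self-contained sketch of the same round-trip check, under the assumption of a 64-bit `size_t`; `CheckedGridSize` is a hypothetical helper name and `std::runtime_error` stands in for `MIOPEN_THROW`:

```cpp
#include <cstddef>
#include <cstdint>
#include <stdexcept>

// Hypothetical helper mirroring the guard added in this PR: a size_t value
// is safe to pass as a 32-bit kernel argument only if it survives a
// round-trip through uint32_t unchanged.
inline uint32_t CheckedGridSize(std::size_t grid_size)
{
    if(grid_size != static_cast<std::size_t>(static_cast<uint32_t>(grid_size)))
        throw std::runtime_error("grid size can't be cast to uint32_t safely");
    return static_cast<uint32_t>(grid_size);
}
```

Without the check, a grid size above `UINT32_MAX` would be silently truncated and the kernel would launch with a wrong dimension; throwing turns that silent corruption into an immediate, diagnosable failure.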
65 changes: 33 additions & 32 deletions src/hip/general_tensor_reorder_sol.cpp
@@ -39,7 +39,7 @@
namespace miopen {
namespace tensor_reorder {

static inline std::string GetNameTrait(std::size_t type_size)
static inline std::string GetKernelNameType(std::size_t type_size)
{
if(type_size == 1)
return "byte";
@@ -59,47 +59,45 @@ static inline std::string GetKernelName(std::size_t data_size,
uint32_t order_3,
const GeneralReorderParam* kparam)
{
if(kparam == nullptr)
MIOPEN_THROW("Memory access fault, kparam is a nullptr");
std::ostringstream kernel_name;
std::string type_trait = GetNameTrait(data_size);
kernel_name << "general_4d_reorder_" << kparam->tile_x << "x" << kparam->tile_y << "_";
if(!(kparam->pack_x == 1 && kparam->pack_y == 1 && kparam->ediv_x == 1 && kparam->ediv_y == 1))
{
kernel_name << "pack_" << kparam->pack_x << "x" << kparam->pack_y << "_ediv_"
<< kparam->ediv_x << "x" << kparam->ediv_y << "_";
}
kernel_name << type_trait << "_r" << order_0 << order_1 << order_2 << order_3;
kernel_name << GetKernelNameType(data_size) << "_r" << order_0 << order_1 << order_2 << order_3;
return kernel_name.str();
}

static inline GeneralReorderParam
HeuristicGet(std::size_t data_size, uint32_t dim_0, uint32_t dim_1, uint32_t dim_2, uint32_t dim_3)
{
/*
* TODO:
* Design a algorithm to determine general tensor reorder tile size.
*/
///\todo Design an algorithm to determine the general tensor reorder tile size.
GeneralReorderParam default_kernel;
if(data_size <= 8 && dim_0 >= 1 && dim_1 >= 1 && dim_2 >= 1 && dim_3 >= 1)
{
if(dim_3 >= 16)
{
return GeneralReorderParam{16, 256, 1, 1, 1, 1};
return GeneralReorderParam{16, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
}
else if(dim_3 >= 8)
{
return GeneralReorderParam{8, 256, 1, 1, 1, 1};
return GeneralReorderParam{8, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
}
else if(dim_3 >= 4)
{
return GeneralReorderParam{4, 256, 1, 1, 1, 1};
return GeneralReorderParam{4, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
}
else if(dim_3 >= 2)
{
return GeneralReorderParam{2, 256, 1, 1, 1, 1};
return GeneralReorderParam{2, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
}
else
{
return GeneralReorderParam{1, 256, 1, 1, 1, 1};
return GeneralReorderParam{1, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
}
}
else
@@ -109,16 +107,15 @@ HeuristicGet(std::size_t data_size, uint32_t dim_0, uint32_t dim_1, uint32_t dim
}
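The cascade of `dim_3` comparisons in `HeuristicGet` reduces to one rule: pick the largest power-of-two tile in {16, 8, 4, 2, 1} that fits the innermost dimension. A compact sketch under that reading; `PickTileX` is a hypothetical helper, not MIOpen API:

```cpp
#include <cstdint>
#include <initializer_list>

// Hypothetical distillation of the tile-size branch in HeuristicGet():
// the innermost-dimension tile is the largest of 16/8/4/2 not exceeding
// dim_3, falling back to 1.
inline uint32_t PickTileX(uint32_t dim_3)
{
    for(uint32_t tile : {16u, 8u, 4u, 2u})
        if(dim_3 >= tile)
            return tile;
    return 1;
}
```

The PR's related change in this hunk replaces the magic block size `256` with the `TENSOR_REORDER_BLOCK_SIZE` macro, so tile choice and block size vary independently.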

} // namespace tensor_reorder
GeneralReorderSolution::GeneralReorderSolution(const ExecutionContext& ctx,
miopenDataType_t data_type_,
uint32_t dim_0_,
uint32_t dim_1_,
uint32_t dim_2_,
uint32_t dim_3_,
uint32_t order_0_,
uint32_t order_1_,
uint32_t order_2_,
uint32_t order_3_)
GenericReorderSolutionImpl::GenericReorderSolutionImpl(miopenDataType_t data_type_,
uint32_t dim_0_,
uint32_t dim_1_,
uint32_t dim_2_,
uint32_t dim_3_,
uint32_t order_0_,
uint32_t order_1_,
uint32_t order_2_,
uint32_t order_3_)
: data_type(data_type_),
dim_0(dim_0_),
dim_1(dim_1_),
@@ -131,12 +128,11 @@ GeneralReorderSolution::GeneralReorderSolution(const ExecutionContext& ctx,
{
if(data_type == miopenInt8x4)
MIOPEN_THROW("This data type is not supported");
num_cu = ctx.GetStream().GetMaxComputeUnits();
std::size_t data_size = miopen::GetTypeSize(data_type);
kernel_param_heuristic = tensor_reorder::HeuristicGet(data_size, dim_0, dim_1, dim_2, dim_3);
}

solver::KernelInfo GeneralReorderSolution::GetKernel() const
solver::KernelInfo GenericReorderSolutionImpl::GetKernelInfo() const
{
std::size_t block_size = TENSOR_REORDER_BLOCK_SIZE;
uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3;
@@ -157,12 +153,12 @@ solver::KernelInfo GeneralReorderSolution::GetKernel() const
kernel.l_wk.push_back(1);
kernel.l_wk.push_back(1);

MIOPEN_LOG_I2("GeneralReorderSolution use kernel: " + kernel_name);
MIOPEN_LOG_T(kernel_name);

return kernel;
}

std::vector<OpKernelArg> GeneralReorderSolution::GetKernelArg() const
std::vector<OpKernelArg> GenericReorderSolutionImpl::GetKernelArg() const
{
std::size_t block_size = TENSOR_REORDER_BLOCK_SIZE;
uint32_t pixel_total = dim_0 * dim_1 * dim_2 * dim_3;
@@ -181,6 +177,8 @@ std::vector<OpKernelArg> GeneralReorderSolution::GetKernelArg() const
opArgs.emplace_back(dim_1);
opArgs.emplace_back(dim_2);
opArgs.emplace_back(dim_3);
if(grid_size != static_cast<uint32_t>(grid_size))
MIOPEN_THROW("Variable grid size can't be cast to uint32_t safely");
opArgs.emplace_back(static_cast<uint32_t>(grid_size));
opArgs.emplace_back(dim_total);
opArgs.emplace_back(magic_stride0.magic);
@@ -193,20 +191,23 @@ std::vector<OpKernelArg> GeneralReorderSolution::GetKernelArg() const
return opArgs;
}

std::string GeneralReorderSolution::GetKernelName() const
std::string GenericReorderSolutionImpl::GetKernelName() const
{
std::size_t data_size = miopen::GetTypeSize(data_type);
return tensor_reorder::GetKernelName(
data_size, order_0, order_1, order_2, order_3, &kernel_param_heuristic);
return tensor_reorder::GetKernelName(miopen::GetTypeSize(data_type),
order_0,
order_1,
order_2,
order_3,
&kernel_param_heuristic);
}

bool GeneralReorderSolution::IsSkippable() const
bool GenericReorderSolutionImpl::IsSkippable() const
{
// Disable the IsSkippable function
return dim_0 == 0 || dim_1 == 0 || dim_2 == 0 || dim_3 == 0;
}

size_t GeneralReorderSolution::GetSize() const
size_t GenericReorderSolutionImpl::GetOutputTensorSize() const
{
return miopen::GetTypeSize(data_type) * dim_0 * dim_1 * dim_2 * dim_3;
}
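`GetOutputTensorSize()` is a plain product: element size in bytes times the four dimensions. A free-function sketch of the same arithmetic; `OutputTensorSizeBytes` is a hypothetical name and its first parameter stands in for `miopen::GetTypeSize(data_type)`:

```cpp
#include <cstddef>
#include <cstdint>

// Hedged sketch of GenericReorderSolutionImpl::GetOutputTensorSize():
// bytes required for the reordered output tensor. A reorder is a
// permutation, so input and output tensors occupy the same number of bytes.
inline std::size_t OutputTensorSizeBytes(std::size_t type_size_bytes,
                                         uint32_t dim_0, uint32_t dim_1,
                                         uint32_t dim_2, uint32_t dim_3)
{
    return type_size_bytes * dim_0 * dim_1 * dim_2 * dim_3;
}
```

For example, a float (4-byte) tensor of shape 2×3×4×5 needs 4 × 2 × 3 × 4 × 5 = 480 bytes of output workspace.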
4 changes: 2 additions & 2 deletions src/include/miopen/batched_transpose_sol.hpp
@@ -51,11 +51,11 @@ struct BatchedTransposeSolution
uint32_t batch_,
uint32_t height_,
uint32_t width_);
solver::KernelInfo GetKernel() const;
solver::KernelInfo GetKernelInfo() const;
std::vector<OpKernelArg> GetKernelArg() const;
std::string GetKernelName() const;
bool IsSkippable() const;
size_t GetSize() const;
size_t GetOutputTensorSize() const;

miopenDataType_t data_type;
uint32_t batch;
28 changes: 13 additions & 15 deletions src/include/miopen/general_tensor_reorder_sol.hpp
@@ -26,10 +26,10 @@
#ifndef GUARD_GENERAL_MIOPEN_TENSOR_REORDER_SOL_HPP
#define GUARD_GENERAL_MIOPEN_TENSOR_REORDER_SOL_HPP

#include <miopen/miopen.h>
#include <miopen/kernel_info.hpp>
#include <miopen/op_kernel_args.hpp>
#include <miopen/execution_context.hpp>
#include <cstdlib>
#include <vector>

namespace miopen {
@@ -44,24 +44,23 @@ struct GeneralReorderParam
int ediv_y{0};
};

struct GeneralReorderSolution
struct GenericReorderSolutionImpl
{
GeneralReorderSolution(const ExecutionContext& ctx_,
miopenDataType_t data_type_,
uint32_t dim_0_,
uint32_t dim_1_,
uint32_t dim_2_,
uint32_t dim_3_,
uint32_t order_0_,
uint32_t order_1_,
uint32_t order_2_,
uint32_t order_3_);
GenericReorderSolutionImpl(miopenDataType_t data_type_,
uint32_t dim_0_,
uint32_t dim_1_,
uint32_t dim_2_,
uint32_t dim_3_,
uint32_t order_0_,
uint32_t order_1_,
uint32_t order_2_,
uint32_t order_3_);
// TODO batched transpose API
solver::KernelInfo GetKernel() const;
solver::KernelInfo GetKernelInfo() const;
std::vector<OpKernelArg> GetKernelArg() const;
std::string GetKernelName() const;
bool IsSkippable() const;
size_t GetSize() const;
size_t GetOutputTensorSize() const;

miopenDataType_t data_type;
uint32_t dim_0;
@@ -72,7 +71,6 @@ struct GeneralReorderSolution
uint32_t order_1;
uint32_t order_2;
uint32_t order_3;
int num_cu;

GeneralReorderParam kernel_param_heuristic;
};