From fb4590dbcdfb0e8ed0bb03ee03a04b8c3eb3e1be Mon Sep 17 00:00:00 2001
From: Haocong WANG
Date: Fri, 8 Apr 2022 05:09:10 +0800
Subject: [PATCH] [Tensor reorder][Quality][#issue 1476] Improve naming style
 and CTest design (#1481)

---
 src/conv/invokers/impl_gemm_dynamic.cpp       |  12 +-
 src/hip/batched_transpose_sol.cpp             |   8 +-
 src/hip/general_tensor_reorder_sol.cpp        |  65 ++--
 src/include/miopen/batched_transpose_sol.hpp  |   4 +-
 .../miopen/general_tensor_reorder_sol.hpp     |  28 +-
 src/include/miopen/tensor_reorder_util.hpp    | 321 ++++++++----------
 .../general_tensor_reorder.cpp                |   2 +
 .../order.hpp                                 |   3 +-
 .../conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp   |  12 +-
 .../conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp   |  12 +-
 .../conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp   |  18 +-
 test/gpu_nchw_nhwc_transpose.cpp              |   3 +-
 test/tensor_reorder.cpp                       | 114 +++----
 13 files changed, 273 insertions(+), 329 deletions(-)

diff --git a/src/conv/invokers/impl_gemm_dynamic.cpp b/src/conv/invokers/impl_gemm_dynamic.cpp
index abc7334f38..16925610af 100644
--- a/src/conv/invokers/impl_gemm_dynamic.cpp
+++ b/src/conv/invokers/impl_gemm_dynamic.cpp
@@ -569,9 +569,9 @@ InvokerFactory MakeImplGemmDynamicForwardXdlopsNHWCInvokerFactory(
     if(!trans_output_skippable)
         opArgsTrans.emplace_back(trans_output.GetKernelArg());

-    trans_input_size  = trans_input_skippable ? 0 : trans_input.GetSize();
-    trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
-    trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();
+    trans_input_size  = trans_input_skippable ? 0 : trans_input.GetOutputTensorSize();
+    trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetOutputTensorSize();
+    trans_output_size = trans_output_skippable ? 0 : trans_output.GetOutputTensorSize();

     int idx = 0;
     if(!trans_input_skippable)
@@ -887,9 +887,9 @@ InvokerFactory MakeImplGemmDynamicBackwardDataXdlopsNHWCInvokerFactory(
     if(!trans_output_skippable)
         opArgsTrans.emplace_back(trans_output.GetKernelArg());

-    trans_input_size  = trans_input_skippable ? 0 : trans_input.GetSize();
-    trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetSize();
-    trans_output_size = trans_output_skippable ? 0 : trans_output.GetSize();
+    trans_input_size  = trans_input_skippable ? 0 : trans_input.GetOutputTensorSize();
+    trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetOutputTensorSize();
+    trans_output_size = trans_output_skippable ? 0 : trans_output.GetOutputTensorSize();

     int idx = 0;
     if(!trans_input_skippable)
diff --git a/src/hip/batched_transpose_sol.cpp b/src/hip/batched_transpose_sol.cpp
index 51a6a99359..87e0bd9c8c 100644
--- a/src/hip/batched_transpose_sol.cpp
+++ b/src/hip/batched_transpose_sol.cpp
@@ -304,7 +304,7 @@ BatchedTransposeSolution::BatchedTransposeSolution(const ExecutionContext& ctx,
     kernel_param_heuristic = batched_transpose::HeuristicGet(data_size, batch, height, width);
 }

-solver::KernelInfo BatchedTransposeSolution::GetKernel() const
+solver::KernelInfo BatchedTransposeSolution::GetKernelInfo() const
 {
     std::size_t block_size = BATCHED_TRANSPOSE_BLOCK_SIZE;
 #if BATCHED_TRANSPOSE_PERSISTENT
@@ -327,7 +327,7 @@ solver::KernelInfo BatchedTransposeSolution::GetKernel() const
     kernel.l_wk.push_back(1);
     kernel.l_wk.push_back(1);

-    MIOPEN_LOG_I2("BatchedTransposeSolution use kernel: " + kernel_name);
+    MIOPEN_LOG_T(kernel_name);

     return kernel;
 }
@@ -351,6 +351,8 @@ std::vector<OpKernelArg> BatchedTransposeSolution::GetKernelArg() const
     opArgs.emplace_back(0); // placeholder
     opArgs.emplace_back(height);
     opArgs.emplace_back(width);
+    if(grid_size != static_cast<uint32_t>(grid_size))
+        MIOPEN_THROW("Variable grid size can't be cast to uint32_t safely");
     opArgs.emplace_back(static_cast<uint32_t>(grid_size));
     opArgs.emplace_back(dim_total);
     opArgs.emplace_back(magic_h.magic);
@@ -374,7 +376,7 @@ bool BatchedTransposeSolution::IsSkippable() const
     return height == 1 || width == 1;
 }

-size_t BatchedTransposeSolution::GetSize() const
+size_t BatchedTransposeSolution::GetOutputTensorSize() const
 {
     return miopen::GetTypeSize(data_type) * batch * height * width;
 }
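Both GetKernelArg() implementations in this patch add the same guard before packing the grid size: the host computes it as size_t, while the kernel argument is 32-bit. A minimal standalone sketch of the narrowing check (the helper name is illustrative, not part of the patch):

    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>

    // Narrow a host-side size_t to the uint32_t the kernel expects,
    // throwing instead of silently truncating an oversized grid.
    inline uint32_t narrow_to_u32(std::size_t grid_size)
    {
        if(grid_size != static_cast<std::size_t>(static_cast<uint32_t>(grid_size)))
            throw std::runtime_error("grid size does not fit in uint32_t");
        return static_cast<uint32_t>(grid_size);
    }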
diff --git a/src/hip/general_tensor_reorder_sol.cpp b/src/hip/general_tensor_reorder_sol.cpp
index 2b004ca851..2012a574a0 100644
--- a/src/hip/general_tensor_reorder_sol.cpp
+++ b/src/hip/general_tensor_reorder_sol.cpp
@@ -39,7 +39,7 @@ namespace miopen {
 namespace tensor_reorder {

-static inline std::string GetNameTrait(std::size_t type_size)
+static inline std::string GetKernelNameType(std::size_t type_size)
 {
     if(type_size == 1)
         return "byte";
@@ -59,47 +59,45 @@ static inline std::string GetKernelName(std::size_t data_size,
                                         uint32_t order_3,
                                         const GeneralReorderParam* kparam)
 {
+    if(kparam == nullptr)
+        MIOPEN_THROW("Memory access fault, kparam is a nullptr");
     std::ostringstream kernel_name;
-    std::string type_trait = GetNameTrait(data_size);
     kernel_name << "general_4d_reorder_" << kparam->tile_x << "x" << kparam->tile_y << "_";
     if(!(kparam->pack_x == 1 && kparam->pack_y == 1 && kparam->ediv_x == 1 && kparam->ediv_y == 1))
     {
         kernel_name << "pack_" << kparam->pack_x << "x" << kparam->pack_y << "_ediv_"
                     << kparam->ediv_x << "x" << kparam->ediv_y << "_";
     }
-    kernel_name << type_trait << "_r" << order_0 << order_1 << order_2 << order_3;
+    kernel_name << GetKernelNameType(data_size) << "_r" << order_0 << order_1 << order_2 << order_3;
     return kernel_name.str();
 }

 static inline GeneralReorderParam
 HeuristicGet(std::size_t data_size, uint32_t dim_0, uint32_t dim_1, uint32_t dim_2, uint32_t dim_3)
 {
-    /*
-     * TODO:
-     * Design a algorithm to determine general tensor reorder tile size.
-     */
+    ///\todo Design an algorithm to determine the general tensor reorder tile size.
     GeneralReorderParam default_kernel;
     if(data_size <= 8 && dim_0 >= 1 && dim_1 >= 1 && dim_2 >= 1 && dim_3 >= 1)
     {
         if(dim_3 >= 16)
         {
-            return GeneralReorderParam{16, 256, 1, 1, 1, 1};
+            return GeneralReorderParam{16, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
         }
         else if(dim_3 >= 8)
         {
-            return GeneralReorderParam{8, 256, 1, 1, 1, 1};
+            return GeneralReorderParam{8, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
         }
         else if(dim_3 >= 4)
         {
-            return GeneralReorderParam{4, 256, 1, 1, 1, 1};
+            return GeneralReorderParam{4, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
         }
         else if(dim_3 >= 2)
         {
-            return GeneralReorderParam{2, 256, 1, 1, 1, 1};
+            return GeneralReorderParam{2, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
         }
         else
         {
-            return GeneralReorderParam{1, 256, 1, 1, 1, 1};
+            return GeneralReorderParam{1, TENSOR_REORDER_BLOCK_SIZE, 1, 1, 1, 1};
         }
     }
     else
@@ -109,16 +107,15 @@ HeuristicGet(std::size_t data_size, uint32_t dim_0, uint32_t dim_1, uint32_t dim
     }
 }
 } // namespace tensor_reorder

-GeneralReorderSolution::GeneralReorderSolution(const ExecutionContext& ctx,
-                                               miopenDataType_t data_type_,
-                                               uint32_t dim_0_,
-                                               uint32_t dim_1_,
-                                               uint32_t dim_2_,
-                                               uint32_t dim_3_,
-                                               uint32_t order_0_,
-                                               uint32_t order_1_,
-                                               uint32_t order_2_,
-                                               uint32_t order_3_)
+GenericReorderSolutionImpl::GenericReorderSolutionImpl(miopenDataType_t data_type_,
+                                                       uint32_t dim_0_,
+                                                       uint32_t dim_1_,
+                                                       uint32_t dim_2_,
+                                                       uint32_t dim_3_,
+                                                       uint32_t order_0_,
+                                                       uint32_t order_1_,
+                                                       uint32_t order_2_,
+                                                       uint32_t order_3_)
     : data_type(data_type_),
       dim_0(dim_0_),
       dim_1(dim_1_),
@@ -131,12 +128,11 @@ GeneralReorderSolution::GeneralReorderSolution(const ExecutionContext& ctx,
 {
     if(data_type == miopenInt8x4)
         MIOPEN_THROW("These data type are not supported");
-    num_cu                 = ctx.GetStream().GetMaxComputeUnits();
     std::size_t data_size  = miopen::GetTypeSize(data_type);
     kernel_param_heuristic = tensor_reorder::HeuristicGet(data_size, dim_0, dim_1, dim_2, dim_3);
 }

-solver::KernelInfo GeneralReorderSolution::GetKernel() const
+solver::KernelInfo GenericReorderSolutionImpl::GetKernelInfo() const
 {
     std::size_t block_size = TENSOR_REORDER_BLOCK_SIZE;
     uint32_t pixel_total   = dim_0 * dim_1 * dim_2 * dim_3;
@@ -157,12 +153,12 @@ solver::KernelInfo GeneralReorderSolution::GetKernel() const
     kernel.l_wk.push_back(1);
     kernel.l_wk.push_back(1);

-    MIOPEN_LOG_I2("GeneralReorderSolution use kernel: " + kernel_name);
+    MIOPEN_LOG_T(kernel_name);

     return kernel;
 }

-std::vector<OpKernelArg> GeneralReorderSolution::GetKernelArg() const
+std::vector<OpKernelArg> GenericReorderSolutionImpl::GetKernelArg() const
 {
     std::size_t block_size = TENSOR_REORDER_BLOCK_SIZE;
     uint32_t pixel_total   = dim_0 * dim_1 * dim_2 * dim_3;
@@ -181,6 +177,8 @@ std::vector<OpKernelArg> GeneralReorderSolution::GetKernelArg() const
     opArgs.emplace_back(dim_1);
     opArgs.emplace_back(dim_2);
     opArgs.emplace_back(dim_3);
+    if(grid_size != static_cast<uint32_t>(grid_size))
+        MIOPEN_THROW("Variable grid size can't be cast to uint32_t safely");
     opArgs.emplace_back(static_cast<uint32_t>(grid_size));
     opArgs.emplace_back(dim_total);
     opArgs.emplace_back(magic_stride0.magic);
@@ -193,20 +191,23 @@ std::vector<OpKernelArg> GeneralReorderSolution::GetKernelArg() const
     return opArgs;
 }

-std::string GeneralReorderSolution::GetKernelName() const
+std::string GenericReorderSolutionImpl::GetKernelName() const
 {
-    std::size_t data_size = miopen::GetTypeSize(data_type);
-    return tensor_reorder::GetKernelName(
-        data_size, order_0, order_1, order_2, order_3, &kernel_param_heuristic);
+    return tensor_reorder::GetKernelName(miopen::GetTypeSize(data_type),
+                                         order_0,
+                                         order_1,
+                                         order_2,
+                                         order_3,
+                                         &kernel_param_heuristic);
 }

-bool GeneralReorderSolution::IsSkippable() const
+bool GenericReorderSolutionImpl::IsSkippable() const
 {
     // Disable the IsSkippable function
     return dim_0 == 0 || dim_1 == 0 || dim_2 == 0 || dim_3 == 0;
 }

-size_t GeneralReorderSolution::GetSize() const
+size_t GenericReorderSolutionImpl::GetOutputTensorSize() const
 {
     return miopen::GetTypeSize(data_type) * dim_0 * dim_1 * dim_2 * dim_3;
 }
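For orientation, GetKernelName() composes the kernel name from the tile shape, the optional pack/ediv traits, the type trait, and the order permutation. A sketch of the resulting string for a 4-byte type with the 16-wide tile that HeuristicGet picks when dim_3 >= 16 ("dword" as the 4-byte trait is an assumption here, since the trait table beyond "byte" is elided from this hunk):

    #include <iostream>
    #include <sstream>

    int main()
    {
        std::ostringstream kernel_name;
        // tile_x = 16, tile_y = TENSOR_REORDER_BLOCK_SIZE (256); pack/ediv are
        // all 1, so that clause is omitted; reorder to [0, 2, 3, 1]:
        kernel_name << "general_4d_reorder_" << 16 << "x" << 256 << "_"
                    << "dword" << "_r" << 0 << 2 << 3 << 1;
        std::cout << kernel_name.str() << "\n"; // general_4d_reorder_16x256_dword_r0231
        return 0;
    }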
diff --git a/src/include/miopen/batched_transpose_sol.hpp b/src/include/miopen/batched_transpose_sol.hpp
index c912669c63..dedbf4f73e 100644
--- a/src/include/miopen/batched_transpose_sol.hpp
+++ b/src/include/miopen/batched_transpose_sol.hpp
@@ -51,11 +51,11 @@ struct BatchedTransposeSolution
                             uint32_t batch_,
                             uint32_t height_,
                             uint32_t width_);
-    solver::KernelInfo GetKernel() const;
+    solver::KernelInfo GetKernelInfo() const;
     std::vector<OpKernelArg> GetKernelArg() const;
     std::string GetKernelName() const;
     bool IsSkippable() const;
-    size_t GetSize() const;
+    size_t GetOutputTensorSize() const;

     miopenDataType_t data_type;
     uint32_t batch;
diff --git a/src/include/miopen/general_tensor_reorder_sol.hpp b/src/include/miopen/general_tensor_reorder_sol.hpp
index 28ce62668d..41f387f995 100644
--- a/src/include/miopen/general_tensor_reorder_sol.hpp
+++ b/src/include/miopen/general_tensor_reorder_sol.hpp
@@ -26,10 +26,10 @@
 #ifndef GUARD_GENERAL_MIOPEN_TENSOR_REORDER_SOL_HPP
 #define GUARD_GENERAL_MIOPEN_TENSOR_REORDER_SOL_HPP

-#include <miopen/execution_context.hpp>
 #include
 #include
 #include
+#include
 #include

 namespace miopen {
@@ -44,24 +44,23 @@ struct GeneralReorderParam
     int ediv_y{0};
 };

-struct GeneralReorderSolution
+struct GenericReorderSolutionImpl
 {
-    GeneralReorderSolution(const ExecutionContext& ctx_,
-                           miopenDataType_t data_type_,
-                           uint32_t dim_0_,
-                           uint32_t dim_1_,
-                           uint32_t dim_2_,
-                           uint32_t dim_3_,
-                           uint32_t order_0_,
-                           uint32_t order_1_,
-                           uint32_t order_2_,
-                           uint32_t order_3_);
+    GenericReorderSolutionImpl(miopenDataType_t data_type_,
+                               uint32_t dim_0_,
+                               uint32_t dim_1_,
+                               uint32_t dim_2_,
+                               uint32_t dim_3_,
+                               uint32_t order_0_,
+                               uint32_t order_1_,
+                               uint32_t order_2_,
+                               uint32_t order_3_);
     // TODO batched transpose API
-    solver::KernelInfo GetKernel() const;
+    solver::KernelInfo GetKernelInfo() const;
     std::vector<OpKernelArg> GetKernelArg() const;
     std::string GetKernelName() const;
     bool IsSkippable() const;
-    size_t GetSize() const;
+    size_t GetOutputTensorSize() const;

     miopenDataType_t data_type;
     uint32_t dim_0;
@@ -72,7 +71,6 @@ struct GeneralReorderSolution
     uint32_t order_1;
     uint32_t order_2;
     uint32_t order_3;
-    int num_cu;

     GeneralReorderParam kernel_param_heuristic;
 };
diff --git a/src/include/miopen/tensor_reorder_util.hpp b/src/include/miopen/tensor_reorder_util.hpp
index 2b106625f8..1010b98e95 100644
--- a/src/include/miopen/tensor_reorder_util.hpp
+++ b/src/include/miopen/tensor_reorder_util.hpp
@@ -28,213 +28,180 @@

 #include
 #include
-#include
 #include
 #include
 #include
 #include

 namespace miopen {

-struct TensorReorderSolution
+struct TensorReorderAttributesBase
 {
-    virtual ~TensorReorderSolution()                      = default;
-    virtual solver::KernelInfo GetKernel() const          = 0;
+    virtual ~TensorReorderAttributesBase()                = default;
+    virtual solver::KernelInfo GetKernelInfo() const      = 0;
     virtual std::vector<OpKernelArg> GetKernelArg() const = 0;
     virtual std::string GetKernelName() const             = 0;
-    virtual bool IsSkippable() const = 0;
-    virtual size_t GetSize() const   = 0;
+    // Used on the HOST side to check the special case where the tensor height or width equals 1.
+    // In such cases we don't need to conduct the batched transpose operation,
+    // since the transposed tensor has exactly the same memory layout as before.
+    virtual bool IsSkippable() const = 0;
+    // Size of the workspace (buffer) that holds the output tensor of the pre-/post-convolution
+    // transform.
+    virtual size_t GetOutputTensorSize() const = 0;
 };

-struct WrapperBatchedTransposeSolution_0132 : TensorReorderSolution
-{
-    BatchedTransposeSolution m_BatchedTransposeSolution;
-    WrapperBatchedTransposeSolution_0132(const ExecutionContext& ctx_,
-                                         miopenDataType_t data_type_,
-                                         uint32_t dim_0_,
-                                         uint32_t dim_1_,
-                                         uint32_t dim_2_,
-                                         uint32_t dim_3_)
-        : m_BatchedTransposeSolution(ctx_, data_type_, dim_0_ * dim_1_, dim_2_, dim_3_)
-    {
-    }
-    solver::KernelInfo GetKernel() const override { return m_BatchedTransposeSolution.GetKernel(); }
-    std::vector<OpKernelArg> GetKernelArg() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelArg();
-    }
-    std::string GetKernelName() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelName();
-    }
-    bool IsSkippable() const override { return m_BatchedTransposeSolution.IsSkippable(); }
-    size_t GetSize() const override { return m_BatchedTransposeSolution.GetSize(); }
-};
+struct BatchedTransposeSolution_0132 : TensorReorderAttributesBase
+{
+    BatchedTransposeSolution impl;
+    BatchedTransposeSolution_0132(const ExecutionContext& ctx_,
+                                  miopenDataType_t data_type_,
+                                  uint32_t dim_0_,
+                                  uint32_t dim_1_,
+                                  uint32_t dim_2_,
+                                  uint32_t dim_3_)
+        : impl(ctx_, data_type_, dim_0_ * dim_1_, dim_2_, dim_3_)
+    {
+    }
+    solver::KernelInfo GetKernelInfo() const override { return impl.GetKernelInfo(); }
+    std::vector<OpKernelArg> GetKernelArg() const override { return impl.GetKernelArg(); }
+    std::string GetKernelName() const override { return impl.GetKernelName(); }
+    bool IsSkippable() const override { return impl.IsSkippable(); }
+    size_t GetOutputTensorSize() const override { return impl.GetOutputTensorSize(); }
+};

-struct WrapperBatchedTransposeSolution_0231 : TensorReorderSolution
-{
-    BatchedTransposeSolution m_BatchedTransposeSolution;
-    WrapperBatchedTransposeSolution_0231(const ExecutionContext& ctx_,
-                                         miopenDataType_t data_type_,
-                                         uint32_t dim_0_,
-                                         uint32_t dim_1_,
-                                         uint32_t dim_2_,
-                                         uint32_t dim_3_)
-        : m_BatchedTransposeSolution(ctx_, data_type_, dim_0_, dim_1_, dim_2_ * dim_3_)
-    {
-    }
-    solver::KernelInfo GetKernel() const override { return m_BatchedTransposeSolution.GetKernel(); }
-    std::vector<OpKernelArg> GetKernelArg() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelArg();
-    }
-    std::string GetKernelName() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelName();
-    }
-    bool IsSkippable() const override { return m_BatchedTransposeSolution.IsSkippable(); }
-    size_t GetSize() const override { return m_BatchedTransposeSolution.GetSize(); }
-};
+struct BatchedTransposeSolution_0231 : TensorReorderAttributesBase
+{
+    BatchedTransposeSolution impl;
+    BatchedTransposeSolution_0231(const ExecutionContext& ctx_,
+                                  miopenDataType_t data_type_,
+                                  uint32_t dim_0_,
+                                  uint32_t dim_1_,
+                                  uint32_t dim_2_,
+                                  uint32_t dim_3_)
+        : impl(ctx_, data_type_, dim_0_, dim_1_, dim_2_ * dim_3_)
+    {
+    }
+    solver::KernelInfo GetKernelInfo() const override { return impl.GetKernelInfo(); }
+    std::vector<OpKernelArg> GetKernelArg() const override { return impl.GetKernelArg(); }
+    std::string GetKernelName() const override { return impl.GetKernelName(); }
+    bool IsSkippable() const override { return impl.IsSkippable(); }
+    size_t GetOutputTensorSize() const override { return impl.GetOutputTensorSize(); }
+};

-struct WrapperBatchedTransposeSolution_0312 : TensorReorderSolution
-{
-    BatchedTransposeSolution m_BatchedTransposeSolution;
-    WrapperBatchedTransposeSolution_0312(const ExecutionContext& ctx_,
-                                         miopenDataType_t data_type_,
-                                         uint32_t dim_0_,
-                                         uint32_t dim_1_,
-                                         uint32_t dim_2_,
-                                         uint32_t dim_3_)
-        : m_BatchedTransposeSolution(ctx_, data_type_, dim_0_, dim_1_ * dim_2_, dim_3_)
-    {
-    }
-    solver::KernelInfo GetKernel() const override { return m_BatchedTransposeSolution.GetKernel(); }
-    std::vector<OpKernelArg> GetKernelArg() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelArg();
-    }
-    std::string GetKernelName() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelName();
-    }
-    bool IsSkippable() const override { return m_BatchedTransposeSolution.IsSkippable(); }
-    size_t GetSize() const override { return m_BatchedTransposeSolution.GetSize(); }
-};
+struct BatchedTransposeSolution_0312 : TensorReorderAttributesBase
+{
+    BatchedTransposeSolution impl;
+    BatchedTransposeSolution_0312(const ExecutionContext& ctx_,
+                                  miopenDataType_t data_type_,
+                                  uint32_t dim_0_,
+                                  uint32_t dim_1_,
+                                  uint32_t dim_2_,
+                                  uint32_t dim_3_)
+        : impl(ctx_, data_type_, dim_0_, dim_1_ * dim_2_, dim_3_)
+    {
+    }
+    solver::KernelInfo GetKernelInfo() const override { return impl.GetKernelInfo(); }
+    std::vector<OpKernelArg> GetKernelArg() const override { return impl.GetKernelArg(); }
+    std::string GetKernelName() const override { return impl.GetKernelName(); }
+    bool IsSkippable() const override { return impl.IsSkippable(); }
+    size_t GetOutputTensorSize() const override { return impl.GetOutputTensorSize(); }
+};

-struct WrapperBatchedTransposeSolution_2301 : TensorReorderSolution
-{
-    BatchedTransposeSolution m_BatchedTransposeSolution;
-    WrapperBatchedTransposeSolution_2301(const ExecutionContext& ctx_,
-                                         miopenDataType_t data_type_,
-                                         uint32_t dim_0_,
-                                         uint32_t dim_1_,
-                                         uint32_t dim_2_,
-                                         uint32_t dim_3_)
-        : m_BatchedTransposeSolution(ctx_, data_type_, 1, dim_0_ * dim_1_, dim_2_ * dim_3_)
-    {
-    }
-    solver::KernelInfo GetKernel() const override { return m_BatchedTransposeSolution.GetKernel(); }
-    std::vector<OpKernelArg> GetKernelArg() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelArg();
-    }
-    std::string GetKernelName() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelName();
-    }
-    bool IsSkippable() const override { return m_BatchedTransposeSolution.IsSkippable(); }
-    size_t GetSize() const override { return m_BatchedTransposeSolution.GetSize(); }
-};
+struct BatchedTransposeSolution_2301 : TensorReorderAttributesBase
+{
+    BatchedTransposeSolution impl;
+    BatchedTransposeSolution_2301(const ExecutionContext& ctx_,
+                                  miopenDataType_t data_type_,
+                                  uint32_t dim_0_,
+                                  uint32_t dim_1_,
+                                  uint32_t dim_2_,
+                                  uint32_t dim_3_)
+        : impl(ctx_, data_type_, 1, dim_0_ * dim_1_, dim_2_ * dim_3_)
+    {
+    }
+    solver::KernelInfo GetKernelInfo() const override { return impl.GetKernelInfo(); }
+    std::vector<OpKernelArg> GetKernelArg() const override { return impl.GetKernelArg(); }
+    std::string GetKernelName() const override { return impl.GetKernelName(); }
+    bool IsSkippable() const override { return impl.IsSkippable(); }
+    size_t GetOutputTensorSize() const override { return impl.GetOutputTensorSize(); }
+};

-struct WrapperBatchedTransposeSolution_3012 : TensorReorderSolution
-{
-    BatchedTransposeSolution m_BatchedTransposeSolution;
-    WrapperBatchedTransposeSolution_3012(const ExecutionContext& ctx_,
-                                         miopenDataType_t data_type_,
-                                         uint32_t dim_0_,
-                                         uint32_t dim_1_,
-                                         uint32_t dim_2_,
-                                         uint32_t dim_3_)
-        : m_BatchedTransposeSolution(ctx_, data_type_, 1, dim_0_ * dim_1_ * dim_2_, dim_3_)
-    {
-    }
-    solver::KernelInfo GetKernel() const override { return m_BatchedTransposeSolution.GetKernel(); }
-    std::vector<OpKernelArg> GetKernelArg() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelArg();
-    }
-    std::string GetKernelName() const override
-    {
-        return m_BatchedTransposeSolution.GetKernelName();
-    }
-    bool IsSkippable() const override { return m_BatchedTransposeSolution.IsSkippable(); }
-    size_t GetSize() const override { return m_BatchedTransposeSolution.GetSize(); }
-};
+struct BatchedTransposeSolution_3012 : TensorReorderAttributesBase
+{
+    BatchedTransposeSolution impl;
+    BatchedTransposeSolution_3012(const ExecutionContext& ctx_,
+                                  miopenDataType_t data_type_,
+                                  uint32_t dim_0_,
+                                  uint32_t dim_1_,
+                                  uint32_t dim_2_,
+                                  uint32_t dim_3_)
+        : impl(ctx_, data_type_, 1, dim_0_ * dim_1_ * dim_2_, dim_3_)
+    {
+    }
+    solver::KernelInfo GetKernelInfo() const override { return impl.GetKernelInfo(); }
+    std::vector<OpKernelArg> GetKernelArg() const override { return impl.GetKernelArg(); }
+    std::string GetKernelName() const override { return impl.GetKernelName(); }
+    bool IsSkippable() const override { return impl.IsSkippable(); }
+    size_t GetOutputTensorSize() const override { return impl.GetOutputTensorSize(); }
+};

-struct WrapperGeneralReorderSolution : TensorReorderSolution
-{
-    GeneralReorderSolution m_GeneralReorderSolution;
-    WrapperGeneralReorderSolution(const ExecutionContext& ctx_,
-                                  miopenDataType_t data_type_,
-                                  uint32_t dim_0_,
-                                  uint32_t dim_1_,
-                                  uint32_t dim_2_,
-                                  uint32_t dim_3_,
-                                  uint32_t order_0_,
-                                  uint32_t order_1_,
-                                  uint32_t order_2_,
-                                  uint32_t order_3_)
-        : m_GeneralReorderSolution(ctx_,
-                                   data_type_,
-                                   dim_0_,
-                                   dim_1_,
-                                   dim_2_,
-                                   dim_3_,
-                                   order_0_,
-                                   order_1_,
-                                   order_2_,
-                                   order_3_)
-    {
-    }
-    solver::KernelInfo GetKernel() const override { return m_GeneralReorderSolution.GetKernel(); }
-    std::vector<OpKernelArg> GetKernelArg() const override
-    {
-        return m_GeneralReorderSolution.GetKernelArg();
-    }
-    std::string GetKernelName() const override { return m_GeneralReorderSolution.GetKernelName(); }
-    bool IsSkippable() const override { return m_GeneralReorderSolution.IsSkippable(); }
-    size_t GetSize() const override { return m_GeneralReorderSolution.GetSize(); }
-};
+struct GenericReorderSolution : TensorReorderAttributesBase
+{
+    GenericReorderSolutionImpl impl;
+    GenericReorderSolution(miopenDataType_t data_type_,
+                           uint32_t dim_0_,
+                           uint32_t dim_1_,
+                           uint32_t dim_2_,
+                           uint32_t dim_3_,
+                           uint32_t order_0_,
+                           uint32_t order_1_,
+                           uint32_t order_2_,
+                           uint32_t order_3_)
+        : impl(data_type_, dim_0_, dim_1_, dim_2_, dim_3_, order_0_, order_1_, order_2_, order_3_)
+    {
+    }
+    solver::KernelInfo GetKernelInfo() const override { return impl.GetKernelInfo(); }
+    std::vector<OpKernelArg> GetKernelArg() const override { return impl.GetKernelArg(); }
+    std::string GetKernelName() const override { return impl.GetKernelName(); }
+    bool IsSkippable() const override { return impl.IsSkippable(); }
+    size_t GetOutputTensorSize() const override { return impl.GetOutputTensorSize(); }
+};

-__inline__ std::unique_ptr<TensorReorderSolution>
-TensorReorderSolutionConstructor(const ExecutionContext& ctx_,
-                                 miopenDataType_t data_type_,
-                                 uint32_t dim_0_,
-                                 uint32_t dim_1_,
-                                 uint32_t dim_2_,
-                                 uint32_t dim_3_,
-                                 uint32_t order_0_,
-                                 uint32_t order_1_,
-                                 uint32_t order_2_,
-                                 uint32_t order_3_)
+inline std::unique_ptr<TensorReorderAttributesBase>
+MakeTensorReorderAttributes(const ExecutionContext& ctx_,
+                            miopenDataType_t data_type_,
+                            uint32_t dim_0_,
+                            uint32_t dim_1_,
+                            uint32_t dim_2_,
+                            uint32_t dim_3_,
+                            uint32_t order_0_,
+                            uint32_t order_1_,
+                            uint32_t order_2_,
+                            uint32_t order_3_)
 {
+    std::unique_ptr<TensorReorderAttributesBase> default_ptr;
+    if(!ctx_.use_hip_kernels)
+    {
+        return default_ptr;
+    }
+    // Default using general reorder
+    if(data_type_ == miopenBFloat16)
+    {
+        MIOPEN_THROW("Unsupported reorder data type");
+    }
     int which = 0;
-    if((data_type_ != miopenDouble) && (order_0_ == 0) && (order_1_ == 1) && (order_2_ == 3) &&
-       (order_3_ == 2))
-        which = 1;
-    if((data_type_ != miopenDouble) && (order_0_ == 0) && (order_1_ == 2) && (order_2_ == 3) &&
-       (order_3_ == 1))
-        which = 2;
-    if((data_type_ != miopenDouble) && (order_0_ == 0) && (order_1_ == 3) && (order_2_ == 1) &&
-       (order_3_ == 2))
-        which = 3;
-    if((data_type_ != miopenDouble) && (order_0_ == 2) && (order_1_ == 3) && (order_2_ == 0) &&
-       (order_3_ == 1))
-        which = 4;
-    if((data_type_ != miopenDouble) && (order_0_ == 3) && (order_1_ == 0) && (order_2_ == 1) &&
-       (order_3_ == 2))
-        which = 5;
+    if(data_type_ != miopenDouble)
+    {
+        if((order_0_ == 0) && (order_1_ == 1) && (order_2_ == 3) && (order_3_ == 2))
+            which = 1;
+        else if((order_0_ == 0) && (order_1_ == 2) && (order_2_ == 3) && (order_3_ == 1))
+            which = 2;
+        else if((order_0_ == 0) && (order_1_ == 3) && (order_2_ == 1) && (order_3_ == 2))
+            which = 3;
+        else if((order_0_ == 2) && (order_1_ == 3) && (order_2_ == 0) && (order_3_ == 1))
+            which = 4;
+        else if((order_0_ == 3) && (order_1_ == 0) && (order_2_ == 1) && (order_3_ == 2))
+            which = 5;
+    }
     // Order [0, 1, 3, 2], [0, 2, 3, 1], [0, 3, 1, 2], [2, 3, 0, 1], [3, 0, 1, 2] are using batched
     // transpose kernel to achieve higher performance. Details as following:
     // reorder to [0, 1, 3, 2] from [0, 1, 2, 3], we can fix layout index [0] and [1], transpose
     // [2, 3] to [3, 2].
     // reorder to [2, 3, 0, 1] from [0, 1, 2, 3], we can
     // see [0, 1] and [2, 3] as entities, then transpose [(0, 1), (2, 3)] to [(2, 3), (0, 1)].
     // reorder to [3, 0, 1, 2] from [0, 1, 2, 3], we can add a fixed layout index, see [0, 1, 2] as
     // an entity, then transpose [(0, 1, 2), 3] to [3, (0, 1, 2)]. The reason we have different API
-    // like WrapperBatchedTransposeSolution_0132 is that we choose different fixed index and two
+    // like BatchedTransposeSolution_0132 is that we choose different fixed index and two
    // dimensions which will be transposed.
     switch(which)
     {
     case 0:
-        return std::make_unique<WrapperGeneralReorderSolution>(ctx_,
-                                                               data_type_,
-                                                               dim_0_,
-                                                               dim_1_,
-                                                               dim_2_,
-                                                               dim_3_,
-                                                               order_0_,
-                                                               order_1_,
-                                                               order_2_,
-                                                               order_3_);
+        return std::make_unique<GenericReorderSolution>(
+            data_type_, dim_0_, dim_1_, dim_2_, dim_3_, order_0_, order_1_, order_2_, order_3_);
     case 1:
-        return std::make_unique<WrapperBatchedTransposeSolution_0132>(
+        return std::make_unique<BatchedTransposeSolution_0132>(
             ctx_, data_type_, dim_0_, dim_1_, dim_2_, dim_3_);
     case 2:
-        return std::make_unique<WrapperBatchedTransposeSolution_0231>(
+        return std::make_unique<BatchedTransposeSolution_0231>(
             ctx_, data_type_, dim_0_, dim_1_, dim_2_, dim_3_);
     case 3:
-        return std::make_unique<WrapperBatchedTransposeSolution_0312>(
+        return std::make_unique<BatchedTransposeSolution_0312>(
             ctx_, data_type_, dim_0_, dim_1_, dim_2_, dim_3_);
     case 4:
-        return std::make_unique<WrapperBatchedTransposeSolution_2301>(
+        return std::make_unique<BatchedTransposeSolution_2301>(
             ctx_, data_type_, dim_0_, dim_1_, dim_2_, dim_3_);
     case 5:
-        return std::make_unique<WrapperBatchedTransposeSolution_3012>(
+        return std::make_unique<BatchedTransposeSolution_3012>(
             ctx_, data_type_, dim_0_, dim_1_, dim_2_, dim_3_);
-    default: return nullptr;
+    default: MIOPEN_THROW("Unsupported reorder sequence"); break;
     }
-    return nullptr;
+    return default_ptr;
 }

 } // namespace miopen
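Taken together, the factory hides which of the six implementations is chosen. A usage sketch mirroring what the reworked test below does (assumes a prepared ExecutionContext `ctx` with use_hip_kernels enabled, illustrative dims `n, c, h, w`, and the miopen namespace in scope):

    // NCHW -> NHWC for a float tensor is order [0, 2, 3, 1], i.e. case 2 above.
    auto sol = MakeTensorReorderAttributes(ctx, miopenFloat, n, c, h, w, 0, 2, 3, 1);
    if(sol != nullptr && !sol->IsSkippable())
    {
        auto kernel_info = sol->GetKernelInfo();       // what to compile and launch
        auto args        = sol->GetKernelArg();        // args[0]/args[1] are dst/src placeholders
        size_t dst_bytes = sol->GetOutputTensorSize(); // workspace for the reordered tensor
        // Build an invoker from kernel_info, patch args[0]/args[1] with device
        // pointers, then run it, exactly as test/tensor_reorder.cpp does below.
    }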
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp
index 7bdf24e14b..e8f236c36e 100644
--- a/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/general_tensor_reorder.cpp
@@ -23,7 +23,9 @@
  * SOFTWARE.
  *
  *******************************************************************************/
+#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS
 #include
+#endif
 #include
 #include "order.hpp"
diff --git a/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp b/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp
index cc989da928..c8e80f7e7f 100644
--- a/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp
+++ b/src/kernels/gpu_general_tensor_reorder_kernel/order.hpp
@@ -23,13 +23,14 @@
  * SOFTWARE.
  *
 *******************************************************************************/
+#include <cstddef>
 #ifndef ORDER_HPP
 #define ORDER_HPP

 template <int... Is>
 struct order
 {
-    static constexpr uint64_t m_size = sizeof...(Is);
+    static constexpr std::size_t m_size = sizeof...(Is);

     // the last dummy element is to prevent the compiler complaining about an empty array, when
     // m_size = 0
     static constexpr int m_data[m_size + 1] = {Is..., 0};
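order.hpp hands the kernels their permutation as a compile-time entity. A minimal sketch of how such a struct can be consumed; the at() accessor here is illustrative, since the real struct's accessors are elided from this hunk:

    #include <cstddef>

    template <int... Is>
    struct order
    {
        static constexpr std::size_t m_size     = sizeof...(Is);
        static constexpr int m_data[m_size + 1] = {Is..., 0}; // trailing 0 avoids a zero-size array
        static constexpr int at(std::size_t i) { return m_data[i]; }
    };

    using r0231 = order<0, 2, 3, 1>;
    static_assert(r0231::m_size == 4, "four dimensions");
    static_assert(r0231::at(1) == 2, "dst dim 1 comes from src dim 2");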
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
index b543ada7aa..66807767ad 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_bwd_nhwc.cpp
@@ -959,11 +959,11 @@ ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
                                                x); // group * k_per_group as batch for weight
         TransposeSolutionDefault2Nhwc trans_output(ctx, ctx.in_data_type, n, k, ho, wo);
         if(!trans_input.IsSkippable())
-            size_trans_input = trans_input.GetSize();
+            size_trans_input = trans_input.GetOutputTensorSize();
         if(!trans_weight.IsSkippable())
-            size_trans_weight = trans_weight.GetSize();
+            size_trans_weight = trans_weight.GetOutputTensorSize();
         if(!trans_output.IsSkippable())
-            size_trans_output = trans_output.GetSize();
+            size_trans_output = trans_output.GetOutputTensorSize();
     }

     if(!ctx.IsFp32())
@@ -1060,19 +1060,19 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC::GetSolution(

         if(!trans_input.IsSkippable())
         {
-            result.construction_params.push_back(trans_input.GetKernel());
+            result.construction_params.push_back(trans_input.GetKernelInfo());
             if(miopen::IsLogging(LoggingLevel::Info2))
                 msg << ", inp trans:" << trans_input.GetKernelName();
         }
         if(!trans_weight.IsSkippable())
         {
-            result.construction_params.push_back(trans_weight.GetKernel());
+            result.construction_params.push_back(trans_weight.GetKernelInfo());
             if(miopen::IsLogging(LoggingLevel::Info2))
                 msg << ", wei trans:" << trans_weight.GetKernelName();
         }
         if(!trans_output.IsSkippable())
         {
-            result.construction_params.push_back(trans_output.GetKernel());
+            result.construction_params.push_back(trans_output.GetKernelInfo());
             if(miopen::IsLogging(LoggingLevel::Info2))
                 msg << ", out trans:" << trans_output.GetKernelName();
         }
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
index c1df1ee205..d89b744917 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_fwd_nhwc.cpp
@@ -798,11 +798,11 @@ ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
         TransposeSolutionNhwc2Default trans_output(ctx, ctx.out_data_type, n, k, ho, wo);

         if(!trans_input.IsSkippable())
-            size_trans_input = trans_input.GetSize();
+            size_trans_input = trans_input.GetOutputTensorSize();
         if(!trans_weight.IsSkippable())
-            size_trans_weight = trans_weight.GetSize();
+            size_trans_weight = trans_weight.GetOutputTensorSize();
         if(!trans_output.IsSkippable())
-            size_trans_output = trans_output.GetSize();
+            size_trans_output = trans_output.GetOutputTensorSize();
     }

     if(!ctx.IsFp32())
@@ -943,19 +943,19 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC::GetSolution(

         if(!trans_input.IsSkippable())
         {
-            result.construction_params.push_back(trans_input.GetKernel());
+            result.construction_params.push_back(trans_input.GetKernelInfo());
             if(miopen::IsLogging(LoggingLevel::Info2))
                 msg << ", inp trans:" << trans_input.GetKernelName();
         }
         if(!trans_weight.IsSkippable())
         {
-            result.construction_params.push_back(trans_weight.GetKernel());
+            result.construction_params.push_back(trans_weight.GetKernelInfo());
             if(miopen::IsLogging(LoggingLevel::Info2))
                 msg << ", wei trans:" << trans_weight.GetKernelName();
         }
         if(!trans_output.IsSkippable())
         {
-            result.construction_params.push_back(trans_output.GetKernel());
+            result.construction_params.push_back(trans_output.GetKernelInfo());
             if(miopen::IsLogging(LoggingLevel::Info2))
                 msg << ", out trans:" << trans_output.GetKernelName();
         }
diff --git a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
index 97f8c3465f..e273f8f8ed 100644
--- a/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
+++ b/src/solver/conv_asm_implicit_gemm_gtc_wrw_nhwc.cpp
@@ -920,11 +920,11 @@ ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(const ConvolutionCo
                                                x); // group * k_per_group as batch for weight
         TransposeSolutionDefault2Nhwc trans_output(ctx, ctx.in_data_type, n, k, ho, wo);
         if(!trans_input.IsSkippable())
-            size_trans_input = trans_input.GetSize();
+            size_trans_input = trans_input.GetOutputTensorSize();
         if(!trans_weight.IsSkippable())
-            size_trans_weight = trans_weight.GetSize();
+            size_trans_weight = trans_weight.GetOutputTensorSize();
         if(!trans_output.IsSkippable())
-            size_trans_output = trans_output.GetSize();
+            size_trans_output = trans_output.GetOutputTensorSize();
     }

@@ -1068,27 +1068,27 @@ ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
         trans_output_skippable = trans_output.IsSkippable();

         if(!trans_input_skippable){
-            result.construction_params.push_back(trans_input.GetKernel());
+            result.construction_params.push_back(trans_input.GetKernelInfo());
             opArgsTrans.emplace_back(trans_input.GetKernelArg());
             if(miopen::IsLogging(LoggingLevel::Info2))
                 msg << ", inp trans:" << trans_input.GetKernelName();
         }
         if(!trans_weight_skippable){
-            result.construction_params.push_back(trans_weight.GetKernel());
+            result.construction_params.push_back(trans_weight.GetKernelInfo());
             opArgsTrans.emplace_back(trans_weight.GetKernelArg());
             if(miopen::IsLogging(LoggingLevel::Info2))
                 msg << ", wei trans:" << trans_weight.GetKernelName();
         }
         if(!trans_output_skippable){
-            result.construction_params.push_back(trans_output.GetKernel());
+            result.construction_params.push_back(trans_output.GetKernelInfo());
             opArgsTrans.emplace_back(trans_output.GetKernelArg());
             if(miopen::IsLogging(LoggingLevel::Info2))
                 msg << ", out trans:" << trans_output.GetKernelName();
         }
diff --git a/test/gpu_nchw_nhwc_transpose.cpp b/test/gpu_nchw_nhwc_transpose.cpp
--- a/test/gpu_nchw_nhwc_transpose.cpp
+++ b/test/gpu_nchw_nhwc_transpose.cpp
-        std::vector<solver::KernelInfo> construction_params{transpose_sol.GetKernel()};
+        std::vector<solver::KernelInfo> construction_params{
+            transpose_sol.GetKernelInfo()};
         const auto invoker =
             miopen::deref(this->handle).PrepareInvoker(*invoker_factory, construction_params);
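All three solver files follow the same workspace pattern: each of the three possible transposes contributes its output-tensor size only when it cannot be skipped. A condensed sketch of the GetWorkspaceSize logic above, not a verbatim excerpt (how the solvers combine the three sizes beyond a plain sum, e.g. padding, is their own business):

    size_t size_trans_input  = trans_input.IsSkippable()  ? 0 : trans_input.GetOutputTensorSize();
    size_t size_trans_weight = trans_weight.IsSkippable() ? 0 : trans_weight.GetOutputTensorSize();
    size_t size_trans_output = trans_output.IsSkippable() ? 0 : trans_output.GetOutputTensorSize();
    size_t workspace         = size_trans_input + size_trans_weight + size_trans_output;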
diff --git a/test/tensor_reorder.cpp b/test/tensor_reorder.cpp
index 20707d3d00..d1e2e1d1ce 100644
--- a/test/tensor_reorder.cpp
+++ b/test/tensor_reorder.cpp
@@ -225,17 +225,23 @@ struct to_miopen_data_type
 };

 template <>
-struct to_miopen_data_type<uint16_t>
+struct to_miopen_data_type<half_float::half>
 {
     static miopenDataType_t get() { return miopenHalf; } // we actually didn't calculate 16bit float
 };

 template <>
-struct to_miopen_data_type<uint8_t>
+struct to_miopen_data_type<int8_t>
 {
     static miopenDataType_t get() { return miopenInt8; }
 };

+template <>
+struct to_miopen_data_type<bfloat16>
+{
+    static miopenDataType_t get() { return miopenBFloat16; }
+};
+
 #define RAND_INTEGER_MAX 120
 #define RAND_INTEGER_MIN -88
@@ -280,11 +286,7 @@ bool compare_equal(float r1, float r2)
 template <typename T>
 bool verify_tensor(tensor<T>& t_gpu, tensor<T>& t_cpu)
 {
-    if(t_gpu.data.size() != t_cpu.data.size())
-    {
-        MIOPEN_LOG_E("size not equal, should not happen");
-        return false;
-    }
+    EXPECT(t_gpu.data.size() == t_cpu.data.size());
     auto idx          = miopen::mismatch_idx(t_gpu.data, t_cpu.data, compare_equal);
     bool valid_result = idx >= miopen::range_distance(t_cpu);
@@ -296,21 +298,21 @@ bool verify_tensor(tensor<T>& t_gpu, tensor<T>& t_cpu)
     return valid_result;
 }

-struct reorder_base
+struct tensor_reorder_base_driver : test_driver
 {
     miopenHandle_t handle{};
 #if MIOPEN_BACKEND_OPENCL
     cl_command_queue q{};
 #endif

-    reorder_base()
+    tensor_reorder_base_driver()
     {
         miopenCreate(&handle);
 #if MIOPEN_BACKEND_OPENCL
         miopenGetStream(handle, &q);
 #endif
     }
-    ~reorder_base() { miopenDestroy(handle); }
+    ~tensor_reorder_base_driver() { miopenDestroy(handle); }

     static std::vector<uint32_t> get_dim_3_size() { return {1, 9, 14}; }
     static std::vector<uint32_t> get_dim_2_size() { return {1, 9, 14}; }
@@ -366,8 +368,8 @@ struct reorder_invoke_param : public miopen::InvokeParams
     {
     }
 };
-template <typename T>
-struct reorder_test : reorder_base
+template <typename T>
+struct tensor_reorder_driver : tensor_reorder_base_driver
 {
     void run()
     {
@@ -399,14 +401,31 @@ struct reorder_test : reorder_base
         tensor<T> t_dst(tensor_len, tensor_strides);
         tensor<T> t_dst_gpu(tensor_len, tensor_strides);
         rand_tensor_integer(t_src);
+
+        miopen::ExecutionContext ctx;
+        ctx.SetStream(&miopen::deref(this->handle));
+        ctx.DetectRocm();
+        // ctx.SetupFloats();
+        auto reorder_sol = MakeTensorReorderAttributes(ctx,
+                                                       to_miopen_data_type<T>::get(),
+                                                       dim_0,
+                                                       dim_1,
+                                                       dim_2,
+                                                       dim_3,
+                                                       order_0,
+                                                       order_1,
+                                                       order_2,
+                                                       order_3);
+        EXPECT(reorder_sol != nullptr);
+        size_t workspace = reorder_sol->IsSkippable() ? sizeof(T) * tensor_sz
+                                                      : reorder_sol->GetOutputTensorSize();
 #if MIOPEN_BACKEND_OPENCL
         cl_context cl_ctx;
         clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &cl_ctx, nullptr);
         cl_int status = CL_SUCCESS;
         cl_mem src_dev =
             clCreateBuffer(cl_ctx, CL_MEM_READ_WRITE, sizeof(T) * tensor_sz, nullptr, &status);
-        cl_mem dst_dev =
-            clCreateBuffer(cl_ctx, CL_MEM_READ_WRITE, sizeof(T) * tensor_sz, nullptr, nullptr);
+        cl_mem dst_dev = clCreateBuffer(cl_ctx, CL_MEM_READ_WRITE, workspace, nullptr, nullptr);
         status |= clEnqueueWriteBuffer(q,
                                        src_dev,
                                        CL_TRUE,
@@ -421,29 +440,13 @@ struct reorder_test : reorder_base
         void* src_dev;
         void* dst_dev;
         EXPECT(hipMalloc(&src_dev, sizeof(T) * tensor_sz) == hipSuccess);
-        EXPECT(hipMalloc(&dst_dev, sizeof(T) * tensor_sz) == hipSuccess);
+        EXPECT(hipMalloc(&dst_dev, workspace) == hipSuccess);
         EXPECT(hipMemcpy(
                    src_dev, t_src.data.data(), sizeof(T) * tensor_sz, hipMemcpyHostToDevice) ==
                hipSuccess);
 #endif
-
         const auto invoke_param = reorder_invoke_param{
             DataCast(static_cast<const void*>(src_dev)), DataCast(dst_dev)};
-
-        miopen::ExecutionContext ctx;
-        ctx.SetStream(&miopen::deref(this->handle));
-        ctx.DetectRocm();
-        // ctx.SetupFloats();
-        auto reorder_sol = TensorReorderSolutionConstructor(ctx,
-                                                            to_miopen_data_type<T>::get(),
-                                                            dim_0,
-                                                            dim_1,
-                                                            dim_2,
-                                                            dim_3,
-                                                            order_0,
-                                                            order_1,
-                                                            order_2,
-                                                            order_3);
         std::vector<OpKernelArg> opArgs = reorder_sol->GetKernelArg();
         boost::optional<miopen::InvokerFactory> invoker_factory(
             [=](const std::vector<miopen::Kernel>& kernels) mutable {
                 return [=](const miopen::Handle& handle,
                            const miopen::AnyInvokeParams& primitive_param) mutable {
                     decltype(auto) invoke_params = primitive_param.CastTo<reorder_invoke_param>();
                     const auto k = handle.Run(kernels[0]);
-
-                    opArgs[0] = OpKernelArg(invoke_params.dst);
-                    opArgs[1] = OpKernelArg(invoke_params.src);
-
+                    opArgs[0] = OpKernelArg(invoke_params.dst);
+                    opArgs[1] = OpKernelArg(invoke_params.src);
                     k(opArgs);
                 };
             });
-        std::vector<solver::KernelInfo> construction_params{reorder_sol->GetKernel()};
+        std::vector<solver::KernelInfo> construction_params{
+            reorder_sol->GetKernelInfo()};
         const auto invoker =
             miopen::deref(this->handle).PrepareInvoker(*invoker_factory, construction_params);
         // run gpu
         invoker(miopen::deref(this->handle), invoke_param);
         // run cpu
         cpu_tensor_reorder(t_dst,
                            t_src,
                            dim_0,
                            dim_1,
                            dim_2,
                            dim_3,
                            order_0,
                            order_1,
                            order_2,
                            order_3);

 #if MIOPEN_BACKEND_OPENCL
-        status = clEnqueueReadBuffer(q,
-                                     dst_dev,
-                                     CL_TRUE,
-                                     0,
-                                     sizeof(T) * tensor_sz,
-                                     t_dst_gpu.data.data(),
-                                     0,
-                                     nullptr,
-                                     nullptr);
+        status = clEnqueueReadBuffer(
+            q, dst_dev, CL_TRUE, 0, workspace, t_dst_gpu.data.data(), 0, nullptr, nullptr);
         EXPECT(status == CL_SUCCESS);
+        clReleaseMemObject(dst_dev);
+        clReleaseMemObject(src_dev);
 #elif MIOPEN_BACKEND_HIP
-        EXPECT(hipMemcpy(t_dst_gpu.data.data(),
-                         dst_dev,
-                         sizeof(T) * tensor_sz,
-                         hipMemcpyDeviceToHost) == hipSuccess);
+        EXPECT(hipMemcpy(t_dst_gpu.data.data(), dst_dev, workspace, hipMemcpyDeviceToHost) ==
+               hipSuccess);
+        hipFree(dst_dev);
+        hipFree(src_dev);
 #endif

         // we expect an exact match, since we use integers
         bool valid_result = verify_tensor(t_dst_gpu, t_dst);
         std::cout << "dim_0:" << dim_0 << ", dim_1:" << dim_1 << ", dim_2:" << dim_2
                   << ", dim_3:" << dim_3 << ", valid:" << valid_result << std::endl;
         EXPECT(valid_result == true);
-
-#if MIOPEN_BACKEND_OPENCL
-        clReleaseMemObject(src_dev);
-        clReleaseMemObject(dst_dev);
-#elif MIOPEN_BACKEND_HIP
-        hipFree(src_dev);
-        hipFree(dst_dev);
-#endif
        };

        iterate_reorder(run_reorder);
    }
};

-int main()
-{
-    run_test<reorder_test<double>>(); // DOUBLE only support general
-                                      // reorder solution, do not
-                                      // support batched transpose.
-    run_test<reorder_test<float>>();
-    run_test<reorder_test<half_float::half>>();
-    run_test<reorder_test<int8_t>>();
-}
+int main(int argc, const char* argv[]) { test_drive<tensor_reorder_driver>(argc, argv); }
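The host-side reference the test compares against (cpu_tensor_reorder, elided from these hunks) boils down to a quadruple loop. A standalone sketch under the convention that dst dimension i takes its extent and coordinates from src dimension order[i], matching the _rXXXX kernel naming; the helper name and the gather convention are assumptions, not the test's actual reference:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // dst is laid out with extents {dim[order[0]], ..., dim[order[3]]};
    // element dst(i0, i1, i2, i3) is gathered from the src coordinate whose
    // axis order[d] equals i_d.
    template <typename T>
    void reorder_4d_host(std::vector<T>& dst,
                         const std::vector<T>& src,
                         const uint32_t dim[4],   // source extents
                         const uint32_t order[4]) // e.g. {0, 2, 3, 1} for r0231
    {
        const uint32_t od[4] = {dim[order[0]], dim[order[1]], dim[order[2]], dim[order[3]]};
        for(uint32_t i0 = 0; i0 < od[0]; i0++)
            for(uint32_t i1 = 0; i1 < od[1]; i1++)
                for(uint32_t i2 = 0; i2 < od[2]; i2++)
                    for(uint32_t i3 = 0; i3 < od[3]; i3++)
                    {
                        const uint32_t dst_idx[4] = {i0, i1, i2, i3};
                        uint32_t src_idx[4]       = {0, 0, 0, 0};
                        for(int d = 0; d < 4; d++)
                            src_idx[order[d]] = dst_idx[d];
                        const std::size_t s =
                            ((std::size_t(src_idx[0]) * dim[1] + src_idx[1]) * dim[2] +
                             src_idx[2]) * dim[3] + src_idx[3];
                        const std::size_t t =
                            ((std::size_t(i0) * od[1] + i1) * od[2] + i2) * od[3] + i3;
                        dst[t] = src[s];
                    }
    }

Because the test fills the source with small integers, every supported element type round-trips exactly, so a plain mismatch scan is enough for verification.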