
Commit

Merge remote-tracking branch 'origin/develop' into igemm_dynamic_v4r1_bwd
carlushuang committed Jul 26, 2020
2 parents 0c02219 + 88e663c commit bcc1bfe
Showing 24 changed files with 1,817 additions and 1,134 deletions.
12 changes: 11 additions & 1 deletion Jenkinsfile
@@ -430,8 +430,18 @@ pipeline {
 
         stage('Bfloat16 Hip Release All') {
             agent{ label rocmnode("vega20") }
+            environment{
+                cmd = """
+                    ulimit -c unlimited
+                    rm -rf build
+                    mkdir build
+                    cd build
+                    CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On ..
+                    make -j test_conv2d
+                """
+            }
             steps{
-                buildJob('hcc', '-DMIOPEN_TEST_BFLOAT16=On -DBUILD_DEV=On -DMIOPEN_TEST_ALL=On -DCMAKE_BUILD_TYPE=release', "", image + "rocm")
+                buildHipClangJob('/opt/rocm/llvm/bin/clang++', '', "", image+'-hip-clang', "/usr/local", cmd)
             }
         }
 
4 changes: 2 additions & 2 deletions src/CMakeLists.txt
@@ -220,10 +220,10 @@ set( MIOpen_Source
     solver/conv_hip_implicit_gemm_nonxdlops_common.cpp
     solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp
     solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp
-    solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp
-    solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp
     solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+    solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp
     solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp
+    solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops_nchw_kcyx_nkhw.cpp
     solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp
 )
 
56 changes: 37 additions & 19 deletions src/conv/invokers/impl_gemm.cpp
100755 → 100644
@@ -50,27 +50,45 @@ InvokerFactory MakeImplGemmDataInvokerFactory(const ConvolutionContext& ctx)
        kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw"))
     // clang-format on
     {
-        float zero = 0.f;
-        TensorDescriptor workspaceDesc(
-            miopenFloat, tensors.outDesc.GetLengths(), tensors.outDesc.GetStrides());
-        SetTensor(handle, workspaceDesc, workSpace, &zero);
-        if(handle.IsProfilingEnabled())
-            elapsed += handle.GetKernelTime();
+        const auto y          = tensors.wDesc.GetLengths()[2];
+        const auto x          = tensors.wDesc.GetLengths()[3];
+        const auto stride_h   = conv.GetConvStrides()[0];
+        const auto stride_w   = conv.GetConvStrides()[1];
+        const auto dilation_h = conv.GetConvDilations()[0];
+        const auto dilation_w = conv.GetConvDilations()[1];
+
+        if((stride_h >= dilation_h * (y - 1) + 1) &&
+           (stride_w >= dilation_w * (x - 1) + 1))
+        {
+            kernel(tensors.in, tensors.w, tensors.out);
+            if(handle.IsProfilingEnabled())
+                elapsed += handle.GetKernelTime();
+        }
+        else
+        {
+            float zero = 0.f;
+            TensorDescriptor workspaceDesc(miopenFloat,
+                                           tensors.outDesc.GetLengths(),
+                                           tensors.outDesc.GetStrides());
+            SetTensor(handle, workspaceDesc, workSpace, &zero);
+            if(handle.IsProfilingEnabled())
+                elapsed += handle.GetKernelTime();
 
-        kernel(tensors.in, tensors.w, workSpace);
-        if(handle.IsProfilingEnabled())
-            elapsed += handle.GetKernelTime();
+            kernel(tensors.in, tensors.w, workSpace);
+            if(handle.IsProfilingEnabled())
+                elapsed += handle.GetKernelTime();
 
-        CastTensor(handle,
-                   &lowp_quant,
-                   workspaceDesc,
-                   workSpace,
-                   tensors.outDesc,
-                   tensors.out,
-                   0,
-                   0);
-        if(handle.IsProfilingEnabled())
-            elapsed += handle.GetKernelTime();
+            CastTensor(handle,
+                       &lowp_quant,
+                       workspaceDesc,
+                       workSpace,
+                       tensors.outDesc,
+                       tensors.out,
+                       0,
+                       0);
+            if(handle.IsProfilingEnabled())
+                elapsed += handle.GetKernelTime();
+        }
     }
     // clang-format off
     else if(kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw" ||
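The hunk above makes the zero-filled float workspace and the final CastTensor conditional: when the dilated filter extent fits within one stride step, the sliding windows never overlap, so the backward-data kernel can write each element of tensors.out exactly once and needs no accumulation buffer. A standalone sketch of the predicate (the helper name is mine, not MIOpen's):

#include <iostream>

// True when the dilated filter footprint does not exceed the stride, i.e.
// each input-gradient element is written by at most one window, so the
// zero-init/accumulate/cast round trip through the float workspace is unneeded.
bool CanWriteGradientDirectly(int stride, int dilation, int filter)
{
    return stride >= dilation * (filter - 1) + 1;
}

int main()
{
    std::cout << CanWriteGradientDirectly(1, 1, 1) << '\n'; // 1x1, stride 1 -> 1: direct path
    std::cout << CanWriteGradientDirectly(2, 1, 3) << '\n'; // 3x3, stride 2 -> 0: workspace + CastTensor
}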
56 changes: 52 additions & 4 deletions src/include/miopen/solver.hpp
@@ -1007,6 +1007,53 @@ struct PerformanceImplicitGemmForwardV4R4Xdlops
     std::tuple<std::size_t, bool> CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const;
 };
 
+struct PerformanceImplicitGemmBwdV1R1Xdlops : Serializable<PerformanceImplicitGemmBwdV1R1Xdlops>
+{
+    int GemmMPerBlock;
+    int GemmNPerBlock;
+    int GemmKPerBlock;
+    int GemmMPerWave;
+    int GemmNPerWave;
+    int GemmKPack;
+    bool GemmAThreadCopyMoreGemmK;
+    bool GemmBThreadCopyMoreGemmKPack;
+
+    PerformanceImplicitGemmBwdV1R1Xdlops(int, int, int, int, int, int, bool, bool);
+    PerformanceImplicitGemmBwdV1R1Xdlops();
+    PerformanceImplicitGemmBwdV1R1Xdlops(bool) : PerformanceImplicitGemmBwdV1R1Xdlops() {}
+
+    template <class Self, class F>
+    static void Visit(Self&& self, F f)
+    {
+        f(self.GemmMPerBlock, "GemmMPerBlock");
+        f(self.GemmNPerBlock, "GemmNPerBlock");
+        f(self.GemmKPerBlock, "GemmKPerBlock");
+        f(self.GemmMPerWave, "GemmMPerWave");
+        f(self.GemmNPerWave, "GemmNPerWave");
+        f(self.GemmKPack, "GemmKPack");
+        f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK");
+        f(self.GemmBThreadCopyMoreGemmKPack, "GemmBThreadCopyMoreGemmKPack");
+    }
+
+    bool operator==(const PerformanceImplicitGemmBwdV1R1Xdlops& other) const;
+    std::string ToString() const;
+
+    void EuristicInit(const ConvolutionContext& ctx);
+    bool SetNextValue();
+    bool IsValidValue() const;
+    bool IsValid(const ConvolutionContext& ctx) const;
+    bool IsReallyValid(const ConvolutionContext& ctx) const;
+    bool IsFastToBeUsedForTuning(const ConvolutionContext& ctx) const;
+
+    std::tuple<int, bool> CalculateBlockSize() const;
+    std::tuple<int, bool> CalculateGridSize(const ConvolutionContext& ctx) const;
+    std::tuple<int, int, int, int, int, bool>
+    CalculateGemmABlockCopyPerformanceParameters(const ConvolutionContext& ctx) const;
+    std::tuple<int, int, int, int, int, bool>
+    CalculateGemmBBlockCopyPerformanceParameters(const ConvolutionContext& ctx) const;
+    std::tuple<std::size_t, bool> CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const;
+};
+
 struct ConvHipImplicitGemmV4R4GenFwdXdlops : SolverBase<ConvolutionContext>
 {
     PerformanceImplicitGemmXdlops GetPerformanceConfig(const ConvolutionContext& ctx) const;
@@ -1231,15 +1278,16 @@ struct ConvHipImplicitGemmBwdDataV4R1Xdlops : SolverBase<ConvolutionContext>
 
 struct ConvHipImplicitGemmBwdDataV1R1Xdlops : SolverBase<ConvolutionContext>
 {
-    PerformanceImplicitGemmXdlops GetPerformanceConfig(const ConvolutionContext& ctx) const;
+    static std::tuple<int, int, int, int> CalculateGemmSize(const ConvolutionContext& ctx);
+    PerformanceImplicitGemmBwdV1R1Xdlops GetPerformanceConfig(const ConvolutionContext& ctx) const;
     bool IsValidPerformanceConfig(const ConvolutionContext& ctx,
-                                  const PerformanceImplicitGemmXdlops& c) const;
+                                  const PerformanceImplicitGemmBwdV1R1Xdlops& c) const;
     bool IsApplicable(const ConvolutionContext& ctx) const;
     size_t GetWorkspaceSize(const ConvolutionContext& ctx) const;
     ConvSolution GetSolution(const ConvolutionContext& ctx,
-                             const PerformanceImplicitGemmXdlops& config,
+                             const PerformanceImplicitGemmBwdV1R1Xdlops& config,
                              bool disableConfigOverrideFromEnv = false) const;
-    PerformanceImplicitGemmXdlops Search(const ConvolutionContext&) const;
+    PerformanceImplicitGemmBwdV1R1Xdlops Search(const ConvolutionContext&) const;
     int RunAndMeasureSolution(const miopen::Handle& profile_h,
                               ConstData_t bot_buf,
                               Data_t top_buf,
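The new PerformanceImplicitGemmBwdV1R1Xdlops struct follows MIOpen's Serializable/Visit idiom: one static Visit enumerates every tunable field, so serialization, comparison, and ToString can all be written as visitors over the same list. A minimal self-contained sketch with a reduced field set (illustrative only, not MIOpen's actual Serializable machinery):

#include <iostream>
#include <sstream>
#include <string>

struct TuningConfig
{
    int GemmMPerBlock             = 128;
    int GemmNPerBlock             = 128;
    bool GemmAThreadCopyMoreGemmK = true;

    // Same shape as Visit in the hunk above: one callback sees every field.
    template <class Self, class F>
    static void Visit(Self&& self, F f)
    {
        f(self.GemmMPerBlock, "GemmMPerBlock");
        f(self.GemmNPerBlock, "GemmNPerBlock");
        f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK");
    }

    // A ToString-style serializer falls out of Visit for free.
    std::string ToString() const
    {
        std::ostringstream ss;
        Visit(*this, [&](auto value, const char* name) { ss << name << ':' << value << ';'; });
        return ss.str();
    }
};

// Prints: GemmMPerBlock:128;GemmNPerBlock:128;GemmAThreadCopyMoreGemmK:1;
int main() { std::cout << TuningConfig{}.ToString() << '\n'; }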
26 changes: 15 additions & 11 deletions src/kernels/MIOpenRNNHiddenStateUpdate.cl
@@ -2,7 +2,7 @@
 *
 * MIT License
 *
-* Copyright (c) 2019 Advanced Micro Devices, Inc.
+* Copyright (c) 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -70,6 +70,8 @@
 
 __kernel void LSTMFwdHidUpdate(const global _FLOAT* cx,
                                global _FLOAT* reservespace,
+                               const int hy_h,
+                               const int hy_stride,
                                const long cx_offset,
                                const long i_offset,
                                const long f_offset,
@@ -88,7 +90,7 @@ __kernel void LSTMFwdHidUpdate(const global _FLOAT* cx,
                               const int cur_batch,
                               const int use_batch)
 {
-    int total_item = cur_batch * HY_H / RD_BLCK;
+    int total_item = cur_batch * hy_h / RD_BLCK;
     total_item = max(total_item, 1);
     _FLOAT activ_param = 1;
 
@@ -103,9 +105,9 @@
 
     for(int gid = get_global_id(0); gid < total_item; gid += get_global_size(0))
     {
-        int b_idx = gid * RD_BLCK / HY_H;
-        int h_idx = gid * RD_BLCK - b_idx * HY_H;
-        int rsv_idx = b_idx * HY_STRIDE + h_idx;
+        int b_idx = gid * RD_BLCK / hy_h;
+        int h_idx = gid * RD_BLCK - b_idx * hy_h;
+        int rsv_idx = b_idx * hy_stride + h_idx;
 
         *((READ_TYPE*)s_dat) = *((const global READ_TYPE*)(reservespace + i_offset + rsv_idx));
         ActivationFunction_Sigmoid(
@@ -175,7 +177,7 @@ __kernel void LSTMFwdHidUpdate(const global _FLOAT* cx,
 
     *((global READ_TYPE*)(reservespace + cell_offset + rsv_idx)) = *((READ_TYPE*)s_dat);
 #if !INFERENCE_MODE
-    *((global READ_TYPE*)(reservespace + activ_cell_offset + b_idx * HY_STRIDE / 6 + h_idx)) =
+    *((global READ_TYPE*)(reservespace + activ_cell_offset + b_idx * hy_stride / 6 + h_idx)) =
         *((READ_TYPE*)cx_dat);
 #endif
     for(int i = 0; i < RD_BLCK; ++i)
@@ -193,6 +195,8 @@ __kernel void LSTMBwdHidUpdate(const global _FLOAT* cx,
                               const global _FLOAT* dcy,
                               global _FLOAT* reservespace,
                               global _FLOAT* workspace,
+                              const int hy_h,
+                              const int hy_stride,
                               const long cx_offset,
                               const long dcy_offset,
                               const long i_offset,
@@ -218,7 +222,7 @@ const int use_batch)
                               const int use_batch,
                               const int use_batch2)
 {
-    int total_item = cur_batch * HY_H / RD_BLCK;
+    int total_item = cur_batch * hy_h / RD_BLCK;
     total_item = max(total_item, 1);
     _FLOAT activ_param = 1;
 
@@ -241,9 +245,9 @@
 
     for(int gid = get_global_id(0); gid < total_item; gid += get_global_size(0))
     {
-        int b_idx = gid * RD_BLCK / HY_H;
-        int h_idx = gid * RD_BLCK - b_idx * HY_H;
-        int rsv_idx = b_idx * HY_STRIDE + h_idx;
+        int b_idx = gid * RD_BLCK / hy_h;
+        int h_idx = gid * RD_BLCK - b_idx * hy_h;
+        int rsv_idx = b_idx * hy_stride + h_idx;
 
         *((READ_TYPE*)dh_dat) = *((const global READ_TYPE*)(workspace + dhidden_offset + rsv_idx));
         *((READ_TYPE*)o_dat) = *((const global READ_TYPE*)(reservespace + o_offset + rsv_idx));
@@ -258,7 +262,7 @@ __kernel void LSTMBwdHidUpdate(const global _FLOAT* cx,
     }
 
     *((READ_TYPE*)cx_dat) = *((const global READ_TYPE*)(reservespace + activ_cell_offset +
-                                                        b_idx * HY_STRIDE / 6 + h_idx));
+                                                        b_idx * hy_stride / 6 + h_idx));
 
     ActivationFunction_TanH_Diff(RD_BLCK,
                                  dcx_dat,
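These kernel hunks replace the compile-time HY_H and HY_STRIDE macros with the runtime arguments hy_h and hy_stride, so one compiled kernel binary can serve different hidden sizes instead of being rebuilt per configuration; the index arithmetic itself is unchanged. A standalone recomputation of that arithmetic with assumed values (hy_stride = 6 * hy_h matches the six hy_h-wide sub-buffers per row implied by the hy_stride / 6 term above; RD_BLCK is the per-work-item vector width):

#include <cstdio>

int main()
{
    const int RD_BLCK = 4, hy_h = 64, hy_stride = 6 * hy_h; // assumed example sizes
    for(int gid : {0, 15, 16})
    {
        int b_idx   = gid * RD_BLCK / hy_h;         // which batch row this work-item touches
        int h_idx   = gid * RD_BLCK - b_idx * hy_h; // offset inside that row
        int rsv_idx = b_idx * hy_stride + h_idx;    // element index into reservespace
        std::printf("gid=%2d -> b_idx=%d h_idx=%2d rsv_idx=%3d\n", gid, b_idx, h_idx, rsv_idx);
    }
}

With these values, gid 15 still lands in batch row 0 (h_idx 60), while gid 16 rolls over to row 1 at rsv_idx 384.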
(Diffs for the remaining 19 changed files are not shown.)
