
Commit

Merge remote-tracking branch 'origin/develop' into igemm_dynamic_v4r1_bwd
carlushuang committed Jul 26, 2020
2 parents 0c02219 + 88e663c commit bcc1bfe
Showing 24 changed files with 1,817 additions and 1,134 deletions.
12 changes: 11 additions & 1 deletion Jenkinsfile
@@ -430,8 +430,18 @@ pipeline {
 
         stage('Bfloat16 Hip Release All') {
             agent{ label rocmnode("vega20") }
+            environment{
+                cmd = """
+                    ulimit -c unlimited
+                    rm -rf build
+                    mkdir build
+                    cd build
+                    CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIOPEN_TEST_BFLOAT16=On -DMIOPEN_TEST_ALL=On -DBUILD_DEV=On -DCMAKE_BUILD_TYPE=release -DMIOPEN_GPU_SYNC=On ..
+                    make -j test_conv2d
+                """
+            }
             steps{
-                buildJob('hcc', '-DMIOPEN_TEST_BFLOAT16=On -DBUILD_DEV=On -DMIOPEN_TEST_ALL=On -DCMAKE_BUILD_TYPE=release', "", image + "rocm")
+                buildHipClangJob('/opt/rocm/llvm/bin/clang++', '', "", image+'-hip-clang', "/usr/local", cmd)
             }
         }
 
4 changes: 2 additions & 2 deletions src/CMakeLists.txt
@@ -220,10 +220,10 @@ set( MIOpen_Source
     solver/conv_hip_implicit_gemm_nonxdlops_common.cpp
     solver/conv_hip_implicit_gemm_bwd_data_v1r1.cpp
     solver/conv_hip_implicit_gemm_bwd_data_v4r1.cpp
-    solver/conv_hip_implicit_gemm_bwd_data_v1r1_xdlops.cpp
-    solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp
     solver/conv_hip_implicit_gemm_bwd_data_v4r1_xdlops.cpp
+    solver/conv_asm_implicit_gemm_v4r1_dynamic.cpp
     solver/conv_hip_implicit_gemm_v4r4_gen_xdlops_fwd_fp32.cpp
+    solver/conv_hip_implicit_gemm_bwd_v1r1_xdlops_nchw_kcyx_nkhw.cpp
     solver/conv_asm_implicit_gemm_bwd_v4r1_dynamic.cpp
 )
 
56 changes: 37 additions & 19 deletions src/conv/invokers/impl_gemm.cpp
100755 → 100644
@@ -50,27 +50,45 @@ InvokerFactory MakeImplGemmDataInvokerFactory(const ConvolutionContext& ctx)
        kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_ncdhw_kczyx_nkdhw"))
     // clang-format on
     {
-        float zero = 0.f;
-        TensorDescriptor workspaceDesc(
-            miopenFloat, tensors.outDesc.GetLengths(), tensors.outDesc.GetStrides());
-        SetTensor(handle, workspaceDesc, workSpace, &zero);
-        if(handle.IsProfilingEnabled())
-            elapsed += handle.GetKernelTime();
+        const auto y          = tensors.wDesc.GetLengths()[2];
+        const auto x          = tensors.wDesc.GetLengths()[3];
+        const auto stride_h   = conv.GetConvStrides()[0];
+        const auto stride_w   = conv.GetConvStrides()[1];
+        const auto dilation_h = conv.GetConvDilations()[0];
+        const auto dilation_w = conv.GetConvDilations()[1];
+
+        if((stride_h >= dilation_h * (y - 1) + 1) &&
+           (stride_w >= dilation_w * (x - 1) + 1))
+        {
+            kernel(tensors.in, tensors.w, tensors.out);
+            if(handle.IsProfilingEnabled())
+                elapsed += handle.GetKernelTime();
+        }
+        else
+        {
+            float zero = 0.f;
+            TensorDescriptor workspaceDesc(miopenFloat,
+                                           tensors.outDesc.GetLengths(),
+                                           tensors.outDesc.GetStrides());
+            SetTensor(handle, workspaceDesc, workSpace, &zero);
+            if(handle.IsProfilingEnabled())
+                elapsed += handle.GetKernelTime();
 
-        kernel(tensors.in, tensors.w, workSpace);
-        if(handle.IsProfilingEnabled())
-            elapsed += handle.GetKernelTime();
+            kernel(tensors.in, tensors.w, workSpace);
+            if(handle.IsProfilingEnabled())
+                elapsed += handle.GetKernelTime();
 
-        CastTensor(handle,
-                   &lowp_quant,
-                   workspaceDesc,
-                   workSpace,
-                   tensors.outDesc,
-                   tensors.out,
-                   0,
-                   0);
-        if(handle.IsProfilingEnabled())
-            elapsed += handle.GetKernelTime();
+            CastTensor(handle,
+                       &lowp_quant,
+                       workspaceDesc,
+                       workSpace,
+                       tensors.outDesc,
+                       tensors.out,
+                       0,
+                       0);
+            if(handle.IsProfilingEnabled())
+                elapsed += handle.GetKernelTime();
+        }
     }
     // clang-format off
     else if(kernel.GetName() == "gridwise_convolution_backward_data_implicit_gemm_v1r1_xdlops_nchw_kcyx_nkhw" ||
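The hunk above makes the zero-filled float workspace and the final CastTensor conditional: when the dilated filter extent fits within one stride step, the sliding windows never overlap, so the backward-data kernel can write each element of tensors.out exactly once and needs no accumulation buffer. A standalone sketch of the predicate (the helper name is mine, not MIOpen's):

#include <iostream>

// True when the dilated filter footprint does not exceed the stride, i.e.
// each input-gradient element is written by at most one window, so the
// zero-init/accumulate/cast round trip through the float workspace is unneeded.
bool CanWriteGradientDirectly(int stride, int dilation, int filter)
{
    return stride >= dilation * (filter - 1) + 1;
}

int main()
{
    std::cout << CanWriteGradientDirectly(1, 1, 1) << '\n'; // 1x1, stride 1 -> 1: direct path
    std::cout << CanWriteGradientDirectly(2, 1, 3) << '\n'; // 3x3, stride 2 -> 0: workspace + CastTensor
}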
56 changes: 52 additions & 4 deletions src/include/miopen/solver.hpp
@@ -1007,6 +1007,53 @@ struct PerformanceImplicitGemmForwardV4R4Xdlops
     std::tuple<std::size_t, bool> CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const;
 };
 
+struct PerformanceImplicitGemmBwdV1R1Xdlops : Serializable<PerformanceImplicitGemmBwdV1R1Xdlops>
+{
+    int GemmMPerBlock;
+    int GemmNPerBlock;
+    int GemmKPerBlock;
+    int GemmMPerWave;
+    int GemmNPerWave;
+    int GemmKPack;
+    bool GemmAThreadCopyMoreGemmK;
+    bool GemmBThreadCopyMoreGemmKPack;
+
+    PerformanceImplicitGemmBwdV1R1Xdlops(int, int, int, int, int, int, bool, bool);
+    PerformanceImplicitGemmBwdV1R1Xdlops();
+    PerformanceImplicitGemmBwdV1R1Xdlops(bool) : PerformanceImplicitGemmBwdV1R1Xdlops() {}
+
+    template <class Self, class F>
+    static void Visit(Self&& self, F f)
+    {
+        f(self.GemmMPerBlock, "GemmMPerBlock");
+        f(self.GemmNPerBlock, "GemmNPerBlock");
+        f(self.GemmKPerBlock, "GemmKPerBlock");
+        f(self.GemmMPerWave, "GemmMPerWave");
+        f(self.GemmNPerWave, "GemmNPerWave");
+        f(self.GemmKPack, "GemmKPack");
+        f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK");
+        f(self.GemmBThreadCopyMoreGemmKPack, "GemmBThreadCopyMoreGemmKPack");
+    }
+
+    bool operator==(const PerformanceImplicitGemmBwdV1R1Xdlops& other) const;
+    std::string ToString() const;
+
+    void EuristicInit(const ConvolutionContext& ctx);
+    bool SetNextValue();
+    bool IsValidValue() const;
+    bool IsValid(const ConvolutionContext& ctx) const;
+    bool IsReallyValid(const ConvolutionContext& ctx) const;
+    bool IsFastToBeUsedForTuning(const ConvolutionContext& ctx) const;
+
+    std::tuple<int, bool> CalculateBlockSize() const;
+    std::tuple<int, bool> CalculateGridSize(const ConvolutionContext& ctx) const;
+    std::tuple<int, int, int, int, int, bool>
+    CalculateGemmABlockCopyPerformanceParameters(const ConvolutionContext& ctx) const;
+    std::tuple<int, int, int, int, int, bool>
+    CalculateGemmBBlockCopyPerformanceParameters(const ConvolutionContext& ctx) const;
+    std::tuple<std::size_t, bool> CalculateLdsNumberOfByte(const ConvolutionContext& ctx) const;
+};
+
 struct ConvHipImplicitGemmV4R4GenFwdXdlops : SolverBase<ConvolutionContext>
 {
     PerformanceImplicitGemmXdlops GetPerformanceConfig(const ConvolutionContext& ctx) const;
@@ -1231,15 +1278,16 @@ struct ConvHipImplicitGemmBwdDataV4R1Xdlops : SolverBase<ConvolutionContext>
 
 struct ConvHipImplicitGemmBwdDataV1R1Xdlops : SolverBase<ConvolutionContext>
 {
-    PerformanceImplicitGemmXdlops GetPerformanceConfig(const ConvolutionContext& ctx) const;
+    static std::tuple<int, int, int, int> CalculateGemmSize(const ConvolutionContext& ctx);
+    PerformanceImplicitGemmBwdV1R1Xdlops GetPerformanceConfig(const ConvolutionContext& ctx) const;
     bool IsValidPerformanceConfig(const ConvolutionContext& ctx,
-                                  const PerformanceImplicitGemmXdlops& c) const;
+                                  const PerformanceImplicitGemmBwdV1R1Xdlops& c) const;
     bool IsApplicable(const ConvolutionContext& ctx) const;
     size_t GetWorkspaceSize(const ConvolutionContext& ctx) const;
     ConvSolution GetSolution(const ConvolutionContext& ctx,
-                             const PerformanceImplicitGemmXdlops& config,
+                             const PerformanceImplicitGemmBwdV1R1Xdlops& config,
                              bool disableConfigOverrideFromEnv = false) const;
-    PerformanceImplicitGemmXdlops Search(const ConvolutionContext&) const;
+    PerformanceImplicitGemmBwdV1R1Xdlops Search(const ConvolutionContext&) const;
     int RunAndMeasureSolution(const miopen::Handle& profile_h,
                               ConstData_t bot_buf,
                               Data_t top_buf,
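The new PerformanceImplicitGemmBwdV1R1Xdlops struct follows MIOpen's Serializable/Visit idiom: one static Visit enumerates every tunable field, so serialization, comparison, and ToString can all be written as visitors over the same list. A minimal self-contained sketch with a reduced field set (illustrative only, not MIOpen's actual Serializable machinery):

#include <iostream>
#include <sstream>
#include <string>

struct TuningConfig
{
    int GemmMPerBlock             = 128;
    int GemmNPerBlock             = 128;
    bool GemmAThreadCopyMoreGemmK = true;

    // Same shape as Visit in the hunk above: one callback sees every field.
    template <class Self, class F>
    static void Visit(Self&& self, F f)
    {
        f(self.GemmMPerBlock, "GemmMPerBlock");
        f(self.GemmNPerBlock, "GemmNPerBlock");
        f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK");
    }

    // A ToString-style serializer falls out of Visit for free.
    std::string ToString() const
    {
        std::ostringstream ss;
        Visit(*this, [&](auto value, const char* name) { ss << name << ':' << value << ';'; });
        return ss.str();
    }
};

// Prints: GemmMPerBlock:128;GemmNPerBlock:128;GemmAThreadCopyMoreGemmK:1;
int main() { std::cout << TuningConfig{}.ToString() << '\n'; }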
26 changes: 15 additions & 11 deletions src/kernels/MIOpenRNNHiddenStateUpdate.cl
@@ -2,7 +2,7 @@
 *
 * MIT License
 *
-* Copyright (c) 2019 Advanced Micro Devices, Inc.
+* Copyright (c) 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -70,6 +70,8 @@
 
 __kernel void LSTMFwdHidUpdate(const global _FLOAT* cx,
                                global _FLOAT* reservespace,
+                               const int hy_h,
+                               const int hy_stride,
                                const long cx_offset,
                                const long i_offset,
                                const long f_offset,
@@ -88,7 +90,7 @@ __kernel void LSTMFwdHidUpdate(const global _FLOAT* cx,
                               const int cur_batch,
                               const int use_batch)
 {
-    int total_item = cur_batch * HY_H / RD_BLCK;
+    int total_item = cur_batch * hy_h / RD_BLCK;
     total_item = max(total_item, 1);
     _FLOAT activ_param = 1;
 
@@ -103,9 +105,9 @@
 
     for(int gid = get_global_id(0); gid < total_item; gid += get_global_size(0))
     {
-        int b_idx = gid * RD_BLCK / HY_H;
-        int h_idx = gid * RD_BLCK - b_idx * HY_H;
-        int rsv_idx = b_idx * HY_STRIDE + h_idx;
+        int b_idx = gid * RD_BLCK / hy_h;
+        int h_idx = gid * RD_BLCK - b_idx * hy_h;
+        int rsv_idx = b_idx * hy_stride + h_idx;
 
         *((READ_TYPE*)s_dat) = *((const global READ_TYPE*)(reservespace + i_offset + rsv_idx));
         ActivationFunction_Sigmoid(
@@ -175,7 +177,7 @@ __kernel void LSTMFwdHidUpdate(const global _FLOAT* cx,
 
     *((global READ_TYPE*)(reservespace + cell_offset + rsv_idx)) = *((READ_TYPE*)s_dat);
 #if !INFERENCE_MODE
-    *((global READ_TYPE*)(reservespace + activ_cell_offset + b_idx * HY_STRIDE / 6 + h_idx)) =
+    *((global READ_TYPE*)(reservespace + activ_cell_offset + b_idx * hy_stride / 6 + h_idx)) =
         *((READ_TYPE*)cx_dat);
 #endif
     for(int i = 0; i < RD_BLCK; ++i)
@@ -193,6 +195,8 @@ __kernel void LSTMBwdHidUpdate(const global _FLOAT* cx,
                               const global _FLOAT* dcy,
                               global _FLOAT* reservespace,
                               global _FLOAT* workspace,
+                              const int hy_h,
+                              const int hy_stride,
                               const long cx_offset,
                               const long dcy_offset,
                               const long i_offset,
@@ -218,7 +222,7 @@ const int use_batch)
                               const int use_batch,
                               const int use_batch2)
 {
-    int total_item = cur_batch * HY_H / RD_BLCK;
+    int total_item = cur_batch * hy_h / RD_BLCK;
     total_item = max(total_item, 1);
     _FLOAT activ_param = 1;
 
@@ -241,9 +245,9 @@
 
     for(int gid = get_global_id(0); gid < total_item; gid += get_global_size(0))
     {
-        int b_idx = gid * RD_BLCK / HY_H;
-        int h_idx = gid * RD_BLCK - b_idx * HY_H;
-        int rsv_idx = b_idx * HY_STRIDE + h_idx;
+        int b_idx = gid * RD_BLCK / hy_h;
+        int h_idx = gid * RD_BLCK - b_idx * hy_h;
+        int rsv_idx = b_idx * hy_stride + h_idx;
 
         *((READ_TYPE*)dh_dat) = *((const global READ_TYPE*)(workspace + dhidden_offset + rsv_idx));
         *((READ_TYPE*)o_dat) = *((const global READ_TYPE*)(reservespace + o_offset + rsv_idx));
@@ -258,7 +262,7 @@ __kernel void LSTMBwdHidUpdate(const global _FLOAT* cx,
     }
 
     *((READ_TYPE*)cx_dat) = *((const global READ_TYPE*)(reservespace + activ_cell_offset +
-                                                        b_idx * HY_STRIDE / 6 + h_idx));
+                                                        b_idx * hy_stride / 6 + h_idx));
 
     ActivationFunction_TanH_Diff(RD_BLCK,
                                  dcx_dat,
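These kernel hunks replace the compile-time HY_H and HY_STRIDE macros with the runtime arguments hy_h and hy_stride, so one compiled kernel binary can serve different hidden sizes instead of being rebuilt per configuration; the index arithmetic itself is unchanged. A standalone recomputation of that arithmetic with assumed values (hy_stride = 6 * hy_h matches the six hy_h-wide sub-buffers per row implied by the hy_stride / 6 term above; RD_BLCK is the per-work-item vector width):

#include <cstdio>

int main()
{
    const int RD_BLCK = 4, hy_h = 64, hy_stride = 6 * hy_h; // assumed example sizes
    for(int gid : {0, 15, 16})
    {
        int b_idx   = gid * RD_BLCK / hy_h;         // which batch row this work-item touches
        int h_idx   = gid * RD_BLCK - b_idx * hy_h; // offset inside that row
        int rsv_idx = b_idx * hy_stride + h_idx;    // element index into reservespace
        std::printf("gid=%2d -> b_idx=%d h_idx=%2d rsv_idx=%3d\n", gid, b_idx, h_idx, rsv_idx);
    }
}

With these values, gid 15 still lands in batch row 0 (h_idx 60), while gid 16 rolls over to row 1 at rsv_idx 384.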
(Diffs for the remaining 19 changed files are not shown.)
