Merge branch 'develop' into atamazov_remove-workaround-898
junliume committed Apr 12, 2022
2 parents 3766dcb + 3d16ede commit 40cb105
Showing 33 changed files with 1,653 additions and 971 deletions.
CMakeLists.txt (2 changes: 1 addition & 1 deletion)
@@ -86,7 +86,7 @@ if(NOT WIN32 AND NOT APPLE)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s")
endif()

-rocm_setup_version(VERSION 2.16.0)
+rocm_setup_version(VERSION 2.17.0)

list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake )
include(TargetFlags)
dev-requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -1,5 +1,5 @@
ROCmSoftwarePlatform/rocm-recipes
-RadeonOpenCompute/rocm-cmake@4a1514850195360ce772182d34b6b5afd8f37253 --build
+RadeonOpenCompute/rocm-cmake@bd4e360c73fa366f817f5aa013433e799d8cd659 --build
-f requirements.txt
# 1.90+
danmar/cppcheck@dd05839a7e63ef04afd34711cb3e1e0ef742882f
requirements.txt (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
-RadeonOpenCompute/rocm-cmake@4a1514850195360ce772182d34b6b5afd8f37253 --build
+RadeonOpenCompute/rocm-cmake@bd4e360c73fa366f817f5aa013433e799d8cd659 --build
sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
boost@1.72 -DCMAKE_POSITION_INDEPENDENT_CODE=On --build
half,https://github.com/pfultz2/half/archive/1.12.0.tar.gz -X header -H sha256:0a08660b68abb176ebc2a0cdf8de46e3182a7f46c66443bb80dbfaaec98cf969 --build
src/hip/general_tensor_reorder_sol.cpp (22 changes: 20 additions & 2 deletions)
@@ -52,6 +52,17 @@ static inline std::string GetKernelNameType(std::size_t type_size)
MIOPEN_THROW("data type not supported");
}

+static inline std::string GetKernelFileName(std::size_t data_size,
+const GeneralReorderParam* kparam)
+{
+if(kparam == nullptr)
+MIOPEN_THROW("Memory access fault, kparam is a nullptr");
+std::ostringstream kernel_file_name;
+kernel_file_name << "general_tensor_reorder_" << kparam->tile_x << "x" << kparam->tile_y << "_";
+kernel_file_name << GetKernelNameType(data_size) << ".cpp";
+return kernel_file_name.str();
+}
+
static inline std::string GetKernelName(std::size_t data_size,
uint32_t order_0,
uint32_t order_1,
@@ -140,9 +151,10 @@ solver::KernelInfo GenericReorderSolutionImpl::GetKernelInfo() const
(block_size * kernel_param_heuristic.tile_x);
std::size_t grid_size = dim_total;

-std::string kernel_name = GetKernelName();
+std::string kernel_name      = GetKernelName();
+std::string kernel_file_name = GetKernelFileName();
solver::KernelInfo kernel;
-kernel.kernel_file = "general_tensor_reorder.cpp";
+kernel.kernel_file = kernel_file_name;
kernel.kernel_name = kernel_name;
kernel.g_wk.clear();
kernel.g_wk.push_back(grid_size * block_size);
@@ -191,6 +203,12 @@ std::vector<OpKernelArg> GenericReorderSolutionImpl::GetKernelArg() const
return opArgs;
}

+std::string GenericReorderSolutionImpl::GetKernelFileName() const
+{
+return tensor_reorder::GetKernelFileName(miopen::GetTypeSize(data_type),
+&kernel_param_heuristic);
+}
+
std::string GenericReorderSolutionImpl::GetKernelName() const
{
return tensor_reorder::GetKernelName(miopen::GetTypeSize(data_type),
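The net effect of the hunks above is that the generic reorder solver now derives the kernel source file name from the tile size chosen by the heuristic instead of hard-coding general_tensor_reorder.cpp. Below is a minimal, standalone sketch of that name composition; it is not MIOpen code, and the ReorderParamSketch type, the 256x8 tile, and the "half" type suffix are illustrative assumptions rather than values taken from this diff.

// Sketch only: mirrors the stream-based name composition in GetKernelFileName.
#include <iostream>
#include <sstream>
#include <string>

struct ReorderParamSketch // stand-in for MIOpen's GeneralReorderParam
{
    int tile_x;
    int tile_y;
};

std::string MakeKernelFileName(const std::string& type_name, const ReorderParamSketch& p)
{
    std::ostringstream oss;
    oss << "general_tensor_reorder_" << p.tile_x << "x" << p.tile_y << "_" << type_name << ".cpp";
    return oss.str();
}

int main()
{
    ReorderParamSketch p{256, 8}; // hypothetical tile sizes picked by the heuristic
    std::cout << MakeKernelFileName("half", p) << '\n'; // prints general_tensor_reorder_256x8_half.cpp
}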
src/include/miopen/general_tensor_reorder_sol.hpp (1 change: 1 addition & 0 deletions)
@@ -58,6 +58,7 @@ struct GenericReorderSolutionImpl
// TODO batched transpose API
solver::KernelInfo GetKernelInfo() const;
std::vector<OpKernelArg> GetKernelArg() const;
+std::string GetKernelFileName() const;
std::string GetKernelName() const;
bool IsSkippable() const;
size_t GetOutputTensorSize() const;
src/include/miopen/tensor_reorder_util.hpp (38 changes: 30 additions & 8 deletions)
@@ -169,26 +169,49 @@ struct GenericReorderSolution : TensorReorderAttributesBase
inline std::unique_ptr<TensorReorderAttributesBase>
MakeTensorReorderAttributes(const ExecutionContext& ctx_,
miopenDataType_t data_type_,
uint32_t dim_0_,
uint32_t dim_1_,
uint32_t dim_2_,
uint32_t dim_3_,
uint64_t dim_0_,
uint64_t dim_1_,
uint64_t dim_2_,
uint64_t dim_3_,
uint32_t order_0_,
uint32_t order_1_,
uint32_t order_2_,
uint32_t order_3_)
{
std::unique_ptr<TensorReorderAttributesBase> default_ptr;
if(!ctx_.use_hip_kernels)
{
return default_ptr;
return nullptr;
}
///\todo 64-bit tensor dimension
// Currently tensor dimensions are limited to 2^32 - 1
if((dim_0_ > UINT32_MAX) || (dim_1_ > UINT32_MAX) || (dim_2_ > UINT32_MAX) ||
(dim_3_ > UINT32_MAX))
{
MIOPEN_THROW("Currentlly we have tensor dimension limitation of 2^32 - 1");
}
// Default using general reorder
if(data_type_ == miopenBFloat16)
{
MIOPEN_THROW("Unsupported reorder data type");
}
// Check required reorder is legal or not
std::vector<std::vector<uint32_t>> all_possible_order{
{0, 1, 3, 2}, {0, 2, 1, 3}, {0, 2, 3, 1}, {0, 3, 1, 2}, {0, 3, 2, 1}, {1, 0, 2, 3},
{1, 0, 3, 2}, {1, 2, 0, 3}, {1, 2, 3, 0}, {1, 3, 0, 2}, {1, 3, 2, 0}, {2, 0, 1, 3},
{2, 0, 3, 1}, {2, 1, 0, 3}, {2, 1, 3, 0}, {2, 3, 0, 1}, {2, 3, 1, 0}, {3, 0, 1, 2},
{3, 0, 2, 1}, {3, 1, 0, 2}, {3, 1, 2, 0}, {3, 2, 0, 1}, {3, 2, 1, 0}};
auto found = std::find_if(
all_possible_order.begin(),
all_possible_order.end(),
[order_0_, order_1_, order_2_, order_3_](std::vector<uint32_t> possible_order) {
return (possible_order[0] == order_0_) && (possible_order[1] == order_1_) &&
(possible_order[2] == order_2_) && (possible_order[3] == order_3_);
});
if(found == all_possible_order.end())
MIOPEN_THROW("Unsupported Reorder");

int which = 0;
// Special cases that utilize batched transpose kernel
if(data_type_ != miopenDouble)
{
if((order_0_ == 0) && (order_1_ == 1) && (order_2_ == 3) && (order_3_ == 2))
@@ -235,9 +258,8 @@ MakeTensorReorderAttributes(const ExecutionContext& ctx_,
case 5:
return std::make_unique<BatchedTransposeSolution_3012>(
ctx_, data_type_, dim_0_, dim_1_, dim_2_, dim_3_);
default: MIOPEN_THROW("Unsupported reorder sequence"); break;
default: MIOPEN_THROW("Failed to call tensor reorder solver");
}
return default_ptr;
}

} // namespace miopen
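Taken together, the changes in this header widen the tensor dimensions to uint64_t while rejecting anything above 2^32 - 1, and they validate the requested order against the 23 non-identity permutations of {0, 1, 2, 3} before choosing between the batched-transpose specializations and the generic reorder solver. The standalone sketch below reproduces those two checks outside MIOpen; the function name and the bool return are illustrative (the diff throws via MIOPEN_THROW instead), and generating the candidate orders with std::next_permutation from the sorted sequence yields exactly the 23 entries of the hard-coded table, in the same order.

// Sketch only: the new dimension and order checks, free of MIOpen types.
#include <algorithm>
#include <array>
#include <cstdint>
#include <stdexcept>
#include <vector>

bool IsSupportedReorder(uint64_t d0, uint64_t d1, uint64_t d2, uint64_t d3,
                        uint32_t o0, uint32_t o1, uint32_t o2, uint32_t o3)
{
    // Dimensions are now passed as 64-bit values but must still fit in 32 bits.
    for(uint64_t d : {d0, d1, d2, d3})
        if(d > UINT32_MAX)
            throw std::runtime_error("tensor dimension exceeds 2^32 - 1");

    // Starting from the sorted sequence, std::next_permutation enumerates the
    // remaining 23 permutations, i.e. every order except the identity {0,1,2,3}.
    std::array<uint32_t, 4> perm{0, 1, 2, 3};
    std::vector<std::array<uint32_t, 4>> all_possible_order;
    while(std::next_permutation(perm.begin(), perm.end()))
        all_possible_order.push_back(perm);

    const std::array<uint32_t, 4> wanted{o0, o1, o2, o3};
    return std::find(all_possible_order.begin(), all_possible_order.end(), wanted) !=
           all_possible_order.end();
}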
src/kernels/MIOpenTensorKernels.cl (139 changes: 131 additions & 8 deletions)
@@ -635,12 +635,43 @@ __kernel void Op2dTensorSquash(const global MIOPEN_TYPE* a,
const int use_bet)
{
MIOPEN_TYPE a_dat[RD_BLCK];
MIOPEN_TYPE b_dat[RD_BLCK];
MIOPEN_TYPE b_dat1[RD_BLCK];
MIOPEN_TYPE b_dat2[RD_BLCK];
MIOPEN_TYPE b_dat3[RD_BLCK];
MIOPEN_TYPE b_dat4[RD_BLCK];
MIOPEN_TYPE b_dat5[RD_BLCK];
MIOPEN_TYPE b_dat6[RD_BLCK];
MIOPEN_TYPE b_dat7[RD_BLCK];
MIOPEN_TYPE b_dat8[RD_BLCK];
MIOPEN_TYPE b_dat9[RD_BLCK];
MIOPEN_TYPE b_dat10[RD_BLCK];
MIOPEN_TYPE b_dat11[RD_BLCK];
MIOPEN_TYPE b_dat12[RD_BLCK];
MIOPEN_TYPE b_dat13[RD_BLCK];
MIOPEN_TYPE b_dat14[RD_BLCK];
MIOPEN_TYPE b_dat15[RD_BLCK];
MIOPEN_TYPE b_dat16[RD_BLCK];
MIOPEN_TYPE c_dat[RD_BLCK];
int g_RD_BLCK;

for(int i = 0; i < RD_BLCK; ++i)
{
b_dat[i] = (MIOPEN_TYPE)0;
b_dat1[i] = (MIOPEN_TYPE)0;
b_dat2[i] = (MIOPEN_TYPE)0;
b_dat3[i] = (MIOPEN_TYPE)0;
b_dat4[i] = (MIOPEN_TYPE)0;
b_dat5[i] = (MIOPEN_TYPE)0;
b_dat6[i] = (MIOPEN_TYPE)0;
b_dat7[i] = (MIOPEN_TYPE)0;
b_dat8[i] = (MIOPEN_TYPE)0;
b_dat9[i] = (MIOPEN_TYPE)0;
b_dat10[i] = (MIOPEN_TYPE)0;
b_dat11[i] = (MIOPEN_TYPE)0;
b_dat12[i] = (MIOPEN_TYPE)0;
b_dat13[i] = (MIOPEN_TYPE)0;
b_dat14[i] = (MIOPEN_TYPE)0;
b_dat15[i] = (MIOPEN_TYPE)0;
b_dat16[i] = (MIOPEN_TYPE)0;
}

for(int gid = get_global_id(0); gid < total_work; gid += get_global_size(0))
@@ -670,16 +701,108 @@ __kernel void Op2dTensorSquash(const global MIOPEN_TYPE* a,
}
}

for(int bid = 0; bid < b_c; bid++)
g_RD_BLCK = gid * RD_BLCK;
if(use_apl1 == 1)
{
if(use_apl1 == 1)
for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16)
{
int b_index = bid * b_nstride + gid * RD_BLCK;
*((READ_TYPE*)b_dat) = *((const global READ_TYPE*)(b + Boffset + b_index));

int b_index1 = (bid * b_nstride) + g_RD_BLCK;
int b_index2 = ((bid + 1) * b_nstride) + g_RD_BLCK;
int b_index3 = ((bid + 2) * b_nstride) + g_RD_BLCK;
int b_index4 = ((bid + 3) * b_nstride) + g_RD_BLCK;
int b_index5 = ((bid + 4) * b_nstride) + g_RD_BLCK;
int b_index6 = ((bid + 5) * b_nstride) + g_RD_BLCK;
int b_index7 = ((bid + 6) * b_nstride) + g_RD_BLCK;
int b_index8 = ((bid + 7) * b_nstride) + g_RD_BLCK;
int b_index9 = ((bid + 8) * b_nstride) + g_RD_BLCK;
int b_index10 = ((bid + 9) * b_nstride) + g_RD_BLCK;
int b_index11 = ((bid + 10) * b_nstride) + g_RD_BLCK;
int b_index12 = ((bid + 11) * b_nstride) + g_RD_BLCK;
int b_index13 = ((bid + 12) * b_nstride) + g_RD_BLCK;
int b_index14 = ((bid + 13) * b_nstride) + g_RD_BLCK;
int b_index15 = ((bid + 14) * b_nstride) + g_RD_BLCK;
int b_index16 = ((bid + 15) * b_nstride) + g_RD_BLCK;
*((READ_TYPE*)b_dat1) = *((const global READ_TYPE*)(b + Boffset + b_index1));
*((READ_TYPE*)b_dat2) = *((const global READ_TYPE*)(b + Boffset + b_index2));
*((READ_TYPE*)b_dat3) = *((const global READ_TYPE*)(b + Boffset + b_index3));
*((READ_TYPE*)b_dat4) = *((const global READ_TYPE*)(b + Boffset + b_index4));
*((READ_TYPE*)b_dat5) = *((const global READ_TYPE*)(b + Boffset + b_index5));
*((READ_TYPE*)b_dat6) = *((const global READ_TYPE*)(b + Boffset + b_index6));
*((READ_TYPE*)b_dat7) = *((const global READ_TYPE*)(b + Boffset + b_index7));
*((READ_TYPE*)b_dat8) = *((const global READ_TYPE*)(b + Boffset + b_index8));
*((READ_TYPE*)b_dat9) = *((const global READ_TYPE*)(b + Boffset + b_index9));
*((READ_TYPE*)b_dat10) = *((const global READ_TYPE*)(b + Boffset + b_index10));
*((READ_TYPE*)b_dat11) = *((const global READ_TYPE*)(b + Boffset + b_index11));
*((READ_TYPE*)b_dat12) = *((const global READ_TYPE*)(b + Boffset + b_index12));
*((READ_TYPE*)b_dat13) = *((const global READ_TYPE*)(b + Boffset + b_index13));
*((READ_TYPE*)b_dat14) = *((const global READ_TYPE*)(b + Boffset + b_index14));
*((READ_TYPE*)b_dat15) = *((const global READ_TYPE*)(b + Boffset + b_index15));
*((READ_TYPE*)b_dat16) = *((const global READ_TYPE*)(b + Boffset + b_index16));

for(int i = 0; i < RD_BLCK; ++i)
{
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat2[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat3[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat4[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat5[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat6[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat7[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat8[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat9[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat10[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat11[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat12[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat13[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat14[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat15[i] * alpha1);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat16[i] * alpha1);
}
}
for(int i = 0; i < RD_BLCK; ++i)
for(int bid = ((b_c / 16) * 16); bid < b_c; bid++)
{
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat[i] * alpha1);
int b_index = bid * b_nstride + g_RD_BLCK;
*((READ_TYPE*)b_dat1) = *((const global READ_TYPE*)(b + Boffset + b_index));

for(int i = 0; i < RD_BLCK; ++i)
{
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], b_dat1[i] * alpha1);
}
}
}
else
{
for(int bid = 0; bid < ((b_c / 16) * 16); bid += 16)
{

for(int i = 0; i < RD_BLCK; ++i)
{
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
}
}
for(int bid = ((b_c / 16) * 16); bid < b_c; bid++)
{

for(int i = 0; i < RD_BLCK; ++i)
{
c_dat[i] += MIOPEN_TENSOR_OP(a_dat[i], 0);
}
}
}
*((global READ_TYPE*)(c + Coffset + io_index)) = *((READ_TYPE*)c_dat);
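In this kernel, the change hoists the use_apl1 branch out of the per-channel loop and splits the loop over b_c into a main loop that handles 16 channels per iteration, written out in the kernel as explicit b_dat1..b_dat16 loads and accumulations, plus a remainder loop for the leftover b_c % 16 channels. The plain C++ sketch below shows that main/remainder structure only; the names are illustrative, a multiply stands in for MIOPEN_TENSOR_OP, and a packed layout replaces the kernel's b_nstride addressing.

// Sketch only: the main-loop / remainder-loop split used by the new code path.
#include <cstddef>
#include <vector>

void squash_sketch(const std::vector<float>& a, // one rd_blck-sized block of A
                   const std::vector<float>& b, // b_c channels, each rd_blck wide (packed)
                   std::vector<float>& c,       // accumulator, rd_blck wide
                   std::size_t b_c,
                   std::size_t rd_blck,
                   float alpha1)
{
    auto accumulate = [&](std::size_t bid) {
        for(std::size_t i = 0; i < rd_blck; ++i)
            c[i] += a[i] * (b[bid * rd_blck + i] * alpha1); // stand-in for MIOPEN_TENSOR_OP
    };

    const std::size_t main_end = (b_c / 16) * 16;
    for(std::size_t bid = 0; bid < main_end; bid += 16) // main loop: 16 channels per trip
        for(std::size_t k = 0; k < 16; ++k)             // the kernel unrolls these by hand
            accumulate(bid + k);

    for(std::size_t bid = main_end; bid < b_c; ++bid) // remainder loop
        accumulate(bid);
}

The else branch added in the diff follows the same main/remainder structure but feeds a constant 0 into MIOPEN_TENSOR_OP instead of loading from b, which is what the old code effectively did through the zero-initialized b_dat buffer.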
