Skip to content

Commit

Permalink
Merge pull request ROCm#55 from AMDComputeLibraries/develop
Browse files Browse the repository at this point in the history
up to dev
  • Loading branch information
ce1adon authored Jun 25, 2018
2 parents 3c586fe + 512f9f2 commit db83eda
Show file tree
Hide file tree
Showing 15 changed files with 782 additions and 484 deletions.
4 changes: 2 additions & 2 deletions src/convolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ size_t ConvolutionDescriptor::ForwardGetWorkSpaceSizeGEMM(Handle& handle,
size_t workspace_size = wei_c * wei_h * wei_w * out_h * out_w * GetTypeSize(yDesc.GetType());

// No workspace is needed for 1x1_stride=1 convolutions
if(wei_h == 1 && wei_w == 1 && u == 1 && v == 1)
if(wei_h == 1 && wei_w == 1 && u == 1 && v == 1 && pad_h == 0 && pad_w == 0)
{
return 0;
}
Expand Down Expand Up @@ -567,7 +567,7 @@ size_t ConvolutionDescriptor::BackwardDataGetWorkSpaceSizeGEMM(Handle& handle,
size_t gemm_size = wei_c * wei_h * wei_w * out_h * out_w * GetTypeSize(dyDesc.GetType());

// No workspace is needed for 1x1_stride=1 convolutions
if(wei_h == 1 && wei_w == 1 && u == 1 && v == 1)
if(wei_h == 1 && wei_w == 1 && u == 1 && v == 1 && pad_h == 0 && pad_w == 0)
{
return 0;
}
Expand Down
7 changes: 7 additions & 0 deletions src/include/miopen/functional.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,13 @@ sequence_t<F> sequence(F f)
return {std::move(f)};
}

// Invoke f once per index in [0, N): sequence() materializes the index
// pack 0..N-1 as integral_constant arguments, and each_args applies f
// to every one of them in order.
template <typename F, std::size_t N>
void repeat_n(F f, std::integral_constant<std::size_t, N>)
{
    sequence([&f](auto... indices) { return each_args(f, indices...); })(
        std::integral_constant<std::size_t, N>{});
}

template <class T>
struct cast_to
{
Expand Down
31 changes: 16 additions & 15 deletions src/include/miopen/solver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,13 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_ASM_KERNELS_PERF_FILTERING)
/// may fail (1) during exhaustive search, (2) during compilation,
/// (3) on execution (like LDS overallocation) or (4) may reveal precision
/// problems. These problems impedes finding and using the really fastest OpenCL solution.
MIOPEN_DECLARE_ENV_VAR(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_FWD) /// \todo Fix & remove the workaround.
MIOPEN_DECLARE_ENV_VAR(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_BWD) /// \todo Fix & remove the workaround.
MIOPEN_DECLARE_ENV_VAR(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_WRW) /// \todo Fix & remove the workaround.

/// \todo Remove env.var (workaround is OFF by default):
MIOPEN_DECLARE_ENV_VAR(MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_FWD)
/// \todo Remove env.var (workaround is OFF by default):
MIOPEN_DECLARE_ENV_VAR(MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_BWD)
/// \todo Remove env.var (workaround is OFF by default):
MIOPEN_DECLARE_ENV_VAR(MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_WRW)

namespace solver {
/// \todo Move wave_size into abstraction wich represent GPU information
Expand Down Expand Up @@ -206,7 +207,7 @@ template <class Solver, class Context, class Db>
auto FindSolutionImpl(rank<0>, Solver s, const Context& context, Db&)
-> decltype(s.GetSolution(context))
{
MIOPEN_LOG_I("Not searchable: " << SolverDbId(s));
MIOPEN_LOG_I(SolverDbId(s) << " (not searchable)");
return s.GetSolution(context);
}

Expand Down Expand Up @@ -304,14 +305,13 @@ std::vector<Solution> SearchForAllSolutions(const Context& search_params, Db db)
/// This is ok so far, as SearchForAllSolutions() is used only for direct
/// convolutions (for now).
if((search_params.direction.IsForward() &&
!miopen::IsDisabled(
miopen::IsEnabled(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_FWD{})) ||
(search_params.direction.IsBackwardData() &&
!miopen::IsDisabled(
miopen::IsEnabled(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_BWD{})) ||
(search_params.direction.IsBackwardWrW() &&
!miopen::IsDisabled(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_WRW{})))
miopen::IsEnabled(MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_WRW{})))
{
skip_the_rest = true;
}
Expand Down Expand Up @@ -558,10 +558,6 @@ struct ConvOclDirectFwd3x3 : SolverBase<ConvolutionContext>
struct ConvOclDirectFwdLegacyExhaustiveSearch : SolverBase<ConvolutionContext>
{
LegacyPerformanceConfig GetPerformanceConfig(const ConvolutionContext&) const;
bool IsValidPerformanceConfig(const ConvolutionContext&, const LegacyPerformanceConfig&) const
{
return true; // Do not check by default.
}
LegacyPerformanceConfig Search(const ConvolutionContext&) const;
};

Expand All @@ -571,13 +567,18 @@ struct ConvOclDirectFwd : ConvOclDirectFwdLegacyExhaustiveSearch

ConvSolution GetSolution(const ConvolutionContext& params,
const LegacyPerformanceConfig& searched_params) const;
bool IsValidPerformanceConfig(const ConvolutionContext&, const LegacyPerformanceConfig&) const;
};

// OpenCL direct-convolution solver specialized for 1x1 filters.
// Inherits the legacy exhaustive-search machinery (GetPerformanceConfig /
// Search) from ConvOclDirectFwdLegacyExhaustiveSearch.
struct ConvOclDirectFwd1x1 : ConvOclDirectFwdLegacyExhaustiveSearch
{
    // True when the problem described by params can be handled by this solver.
    bool IsApplicable(const ConvolutionContext& params) const;
    // Builds the kernel/solution from a previously searched performance config.
    ConvSolution GetSolution(const ConvolutionContext& params,
                             const LegacyPerformanceConfig& searched_params) const;
    // Accepts any config unconditionally — no validation is performed for
    // the 1x1 case.
    bool IsValidPerformanceConfig(const ConvolutionContext&, const LegacyPerformanceConfig&) const
    {
        return true;
    }
};

struct ConvBinWinograd3x3U : SolverBase<ConvolutionContext>
Expand Down
17 changes: 17 additions & 0 deletions src/include/miopen/tensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ template <std::size_t N, class T, class U>
auto tien(T&& x, U y)
MIOPEN_RETURNS(tie_impl(std::forward<T>(x), y, typename detail::gens<N>::type{}));

template <typename F, std::size_t... Ns>
auto create_tuple_impl(F f, detail::seq<Ns...>)
{
return std::make_tuple(std::forward<decltype(f(Ns))>(f(Ns))...);
}

template <std::size_t N, typename F>
auto create_tuple(F f)
{
return create_tuple_impl(f, typename detail::gens<N>::type{});
}

inline std::size_t GetTypeSize(miopenDataType_t d)
{
switch(d)
Expand All @@ -79,6 +91,11 @@ struct TensorDescriptor : miopenTensorDescriptor
std::initializer_list<std::size_t> pstrides);
TensorDescriptor(miopenDataType_t t, const int* plens, int size);
TensorDescriptor(miopenDataType_t t, const int* plens, const int* pstrides, int size);

TensorDescriptor(miopenDataType_t t,
std::vector<std::size_t> lens_in,
std::vector<std::size_t> strides_in);

template <class Range>
TensorDescriptor(miopenDataType_t t, const Range& plens)
: lens(plens.begin(), plens.end()), packed(true), type(t)
Expand Down
101 changes: 101 additions & 0 deletions src/include/miopen/tensor_ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,111 @@
#include <miopen/handle.hpp>
#include <miopen/miopen.h>
#include <miopen/object.hpp>
#include <miopen/functional.hpp>
#include <vector>
#include <boost/range/combine.hpp>
#include <boost/range/adaptor/filtered.hpp>

namespace miopen {

// Predicate for boost::adaptors::filtered: keeps combined (length, strides...)
// tuples whose first element — a dimension length — is greater than 1, i.e.
// drops degenerate dimensions when flattening tensor descriptors.
struct f_length_is_not_1_t
{
    // const-qualified: range adaptors store a copy of the predicate and are
    // free to invoke it through a const reference; a non-const call operator
    // would fail to compile in that case.
    template <typename T>
    bool operator()(T&& v) const
    {
        return boost::get<0>(v) > 1;
    }
};

TensorDescriptor GetFlattenedTensorDescriptor(const TensorDescriptor& desc);

/// Flatten a pack of tensor descriptors that share identical lengths (but may
/// have different strides) into descriptors of the lowest possible rank that
/// still describe the same memory layouts. A run of adjacent dimensions is
/// merged only when it is contiguous (stride ratio equals the length) in
/// EVERY input tensor, so all returned descriptors end up with the same
/// (flattened) lengths. Returns one flattened TensorDescriptor per input,
/// in the same order, as a std::tuple.
/// NOTE(review): if every dimension has length <= 1, non1_length_strides is
/// empty and the first dereference below is past-the-end — presumably callers
/// never pass such descriptors; confirm.
template <typename... TDescriptors>
std::tuple<TDescriptors...>
GetConsistentFlattenedTensorDescriptors(const TDescriptors&... real_descriptor_pack)
{
    constexpr std::size_t NTensor = sizeof...(TDescriptors);
    // Compile-time tensor count, passed to repeat_n so per-tensor work can be
    // unrolled with constexpr indices (needed for boost::get<I> below).
    std::integral_constant<std::size_t, NTensor> NTensorConstant;

    // Pointer view of the pack, for runtime indexing.
    std::array<const TensorDescriptor*, NTensor> real_descriptors{{(&real_descriptor_pack)...}};

#ifndef NDEBUG
    // sanity check: all input TensorDescriptors should have the same GetLengths()
    const auto& real_desc_0_lens = real_descriptors[0]->GetLengths();

    for(std::size_t itensor = 1; itensor < NTensor; ++itensor)
    {
        if(real_desc_0_lens != real_descriptors[itensor]->GetLengths())
            MIOPEN_THROW(miopenStatusBadParm, "Lengths of Tensors are different.");
    }
#endif

    // start flattening tensors
    std::array<std::vector<std::size_t>, NTensor> array_of_flat_lengths;
    std::array<std::vector<std::size_t>, NTensor> array_of_flat_strides;

    // Zip each dimension's length with its stride in every tensor, skipping
    // length-1 dimensions: they carry no layout information.
    auto non1_length_strides =
        boost::combine(real_descriptors[0]->GetLengths(), real_descriptor_pack.GetStrides()...) |
        boost::adaptors::filtered(f_length_is_not_1_t());

    auto i = non1_length_strides.begin();
    // Running length of the dimension group currently being merged.
    std::size_t flat_len = boost::get<0>(*i);
    auto i_previous = i++;

    // the 0-th dimension full-length doesn't matter
    for(; i != non1_length_strides.end(); ++i)
    {
        std::size_t len = boost::get<0>(*i);

        // A dimension can be merged into the previous one only if, in every
        // tensor, previous_stride / stride == len (i.e. no padding between
        // the two dimensions in that tensor's layout).
        bool is_all_full_length = true;
        repeat_n(
            [&](auto itensor) {
                // Tuple slot 0 is the length; slot itensor+1 is that tensor's stride.
                std::size_t stride = boost::get<itensor + 1>(*i);
                std::size_t previous_stride = boost::get<itensor + 1>(*i_previous);
                std::size_t full_len = previous_stride / stride;
                if(len != full_len)
                    is_all_full_length = false;
            },
            NTensorConstant);

        if(is_all_full_length)
        {
            // Contiguous in all tensors: fold this dimension into the group.
            flat_len *= len;
        }
        else
        {
            // Close the current group: record its merged length once (lengths
            // are shared) and its stride per tensor, then start a new group.
            array_of_flat_lengths[0].push_back(flat_len);

            repeat_n(
                [&](auto itensor) {
                    std::size_t previous_stride = boost::get<itensor + 1>(*i_previous);
                    array_of_flat_strides[itensor].push_back(previous_stride);
                },
                NTensorConstant);
            flat_len = len;
        }
        i_previous = i;
    }
    // lengths of all flattened tensors are the same
    array_of_flat_lengths[0].push_back(flat_len);

    // strides of all flattened tensors are different
    repeat_n(
        [&](auto itensor) {
            std::size_t previous_stride = boost::get<itensor + 1>(*i_previous);
            array_of_flat_strides[itensor].push_back(previous_stride);
        },
        NTensorConstant);

    // Replicate the shared length vector to every tensor's slot.
    for(std::size_t itensor = 1; itensor < NTensor; ++itensor)
        array_of_flat_lengths[itensor] = array_of_flat_lengths[0];

    // Build one flattened descriptor per input, preserving each element type.
    return create_tuple<NTensor>([&](auto itensor) {
        return TensorDescriptor{real_descriptors[itensor]->GetType(),
                                std::move(array_of_flat_lengths[itensor]),
                                std::move(array_of_flat_strides[itensor])};
    });
}

void ScaleTensor(
Handle& handle, const TensorDescriptor& yDesc, Data_t y, const void* alpha, int offset = 0);

Expand Down
32 changes: 4 additions & 28 deletions src/ocl/convolutionocl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,14 +330,6 @@ void ConvolutionDescriptor::FindConvFwdAlgorithm(Handle& handle,

std::tie(std::ignore, std::ignore, wei_h, wei_w) = tien<4>(wDesc.GetLengths());

/// \todo add support for non-zero padding in 1x1conv
if((wei_h == 1 && wei_w == 1) && (pad_h > 0 || pad_w > 0))
{
MIOPEN_THROW(miopenStatusBadParm,
"Invalid config. MIOPEN expects padding "
"== 0 when filter size == 1");
}

int out_h, out_w;
std::tie(std::ignore, std::ignore, out_h, out_w) = tien<4>(yDesc.GetLengths());

Expand Down Expand Up @@ -1050,7 +1042,7 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle,
for(int i = 0; i < in_n; i++)
{
int out_offset = i * wei_n * out_h * out_w;
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1)
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1 || pad_h != 0 || pad_w != 0)
{
size_t in_offset = i * in_c * in_h * in_w;
Im2ColGPU(handle,
Expand Down Expand Up @@ -1410,14 +1402,6 @@ void ConvolutionDescriptor::FindConvBwdDataAlgorithm(Handle& handle,

std::tie(std::ignore, std::ignore, wei_h, wei_w) = tien<4>(wDesc.GetLengths());

/// \todo add support for non-zero padding in 1x1conv
if((wei_h == 1 && wei_w == 1) && (pad_h > 0 || pad_w > 0))
{
MIOPEN_THROW(miopenStatusBadParm,
"Invalid config. MIOPEN expects padding "
"== 0 when filter size == 1");
}

int out_h, out_w;
std::tie(std::ignore, std::ignore, out_h, out_w) = tien<4>(dyDesc.GetLengths());

Expand Down Expand Up @@ -2176,7 +2160,7 @@ void ConvolutionDescriptor::ConvolutionBackwardData(Handle& handle,
{
int out_offset = i * wei_n * out_h * out_w;

if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1)
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1 || pad_h != 0 || pad_w != 0)
{
size_t in_offset = i * in_c * in_h * in_w;

Expand Down Expand Up @@ -2625,14 +2609,6 @@ void ConvolutionDescriptor::FindConvBwdWeightsAlgorithm(Handle& handle,

std::tie(std::ignore, std::ignore, wei_h, wei_w) = tien<4>(dwDesc.GetLengths());

/// \todo add support for non-zero padding in 1x1conv
if((wei_h == 1 && wei_w == 1) && (pad_h > 0 || pad_w > 0))
{
MIOPEN_THROW(miopenStatusBadParm,
"Invalid config. MIOPEN expects padding "
"== 0 when filter size == 1");
}

int out_h, out_w;
std::tie(std::ignore, std::ignore, out_h, out_w) = tien<4>(dyDesc.GetLengths());

Expand Down Expand Up @@ -2963,7 +2939,7 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(Handle& handle,

std::string network_config;

if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1)
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1 || pad_h != 0 || pad_w != 0)
{
assert(workSpace != nullptr &&
workSpaceSize >=
Expand All @@ -2980,7 +2956,7 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(Handle& handle,
for(int i = 0; i < in_n; i++)
{
int out_offset = i * wei_n * out_h * out_w;
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1)
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1 || pad_h != 0 || pad_w != 0)
{
size_t in_offset = i * in_c * in_h * in_w;
Im2ColGPU(handle,
Expand Down
Loading

0 comments on commit db83eda

Please sign in to comment.