Skip to content

Commit

Permalink
Merge pull request ROCm#55 from AMDComputeLibraries/develop
Browse files Browse the repository at this point in the history
up to dev
  • Loading branch information
ce1adon authored Jun 25, 2018
2 parents 3c586fe + 512f9f2 commit db83eda
Show file tree
Hide file tree
Showing 15 changed files with 782 additions and 484 deletions.
4 changes: 2 additions & 2 deletions src/convolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ size_t ConvolutionDescriptor::ForwardGetWorkSpaceSizeGEMM(Handle& handle,
size_t workspace_size = wei_c * wei_h * wei_w * out_h * out_w * GetTypeSize(yDesc.GetType());

// No workspace is needed for 1x1_stride=1 convolutions
if(wei_h == 1 && wei_w == 1 && u == 1 && v == 1)
if(wei_h == 1 && wei_w == 1 && u == 1 && v == 1 && pad_h == 0 && pad_w == 0)
{
return 0;
}
Expand Down Expand Up @@ -567,7 +567,7 @@ size_t ConvolutionDescriptor::BackwardDataGetWorkSpaceSizeGEMM(Handle& handle,
size_t gemm_size = wei_c * wei_h * wei_w * out_h * out_w * GetTypeSize(dyDesc.GetType());

// No workspace is needed for 1x1_stride=1 convolutions
if(wei_h == 1 && wei_w == 1 && u == 1 && v == 1)
if(wei_h == 1 && wei_w == 1 && u == 1 && v == 1 && pad_h == 0 && pad_w == 0)
{
return 0;
}
Expand Down
7 changes: 7 additions & 0 deletions src/include/miopen/functional.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,13 @@ sequence_t<F> sequence(F f)
return {std::move(f)};
}

// Invoke f once per index in [0, N): sequence() materializes the index
// pack 0..N-1 as integral_constant arguments, and each_args applies f
// to every one of them in order.
template <typename F, std::size_t N>
void repeat_n(F f, std::integral_constant<std::size_t, N>)
{
    sequence([&f](auto... indices) { return each_args(f, indices...); })(
        std::integral_constant<std::size_t, N>{});
}

template <class T>
struct cast_to
{
Expand Down
31 changes: 16 additions & 15 deletions src/include/miopen/solver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,13 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEBUG_AMD_ASM_KERNELS_PERF_FILTERING)
/// may fail (1) during exhaustive search, (2) during compilation,
/// (3) on execution (like LDS overallocation) or (4) may reveal precision
/// problems. These problems impedes finding and using the really fastest OpenCL solution.
MIOPEN_DECLARE_ENV_VAR(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_FWD) /// \todo Fix & remove the workaround.
MIOPEN_DECLARE_ENV_VAR(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_BWD) /// \todo Fix & remove the workaround.
MIOPEN_DECLARE_ENV_VAR(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_WRW) /// \todo Fix & remove the workaround.

/// \todo Remove env.var (workaround is OFF by default):
MIOPEN_DECLARE_ENV_VAR(MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_FWD)
/// \todo Remove env.var (workaround is OFF by default):
MIOPEN_DECLARE_ENV_VAR(MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_BWD)
/// \todo Remove env.var (workaround is OFF by default):
MIOPEN_DECLARE_ENV_VAR(MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_WRW)

namespace solver {
/// \todo Move wave_size into abstraction wich represent GPU information
Expand Down Expand Up @@ -206,7 +207,7 @@ template <class Solver, class Context, class Db>
auto FindSolutionImpl(rank<0>, Solver s, const Context& context, Db&)
-> decltype(s.GetSolution(context))
{
MIOPEN_LOG_I("Not searchable: " << SolverDbId(s));
MIOPEN_LOG_I(SolverDbId(s) << " (not searchable)");
return s.GetSolution(context);
}

Expand Down Expand Up @@ -304,14 +305,13 @@ std::vector<Solution> SearchForAllSolutions(const Context& search_params, Db db)
/// This is ok so far, as SearchForAllSolutions() is used only for direct
/// convolutions (for now).
if((search_params.direction.IsForward() &&
!miopen::IsDisabled(
miopen::IsEnabled(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_FWD{})) ||
(search_params.direction.IsBackwardData() &&
!miopen::IsDisabled(
miopen::IsEnabled(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_BWD{})) ||
(search_params.direction.IsBackwardWrW() &&
!miopen::IsDisabled(
MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_WRW{})))
miopen::IsEnabled(MIOPEN_OPENCL_WORKAROUND_FIND_ALL_CONV_DIRECT_WRW{})))
{
skip_the_rest = true;
}
Expand Down Expand Up @@ -558,10 +558,6 @@ struct ConvOclDirectFwd3x3 : SolverBase<ConvolutionContext>
struct ConvOclDirectFwdLegacyExhaustiveSearch : SolverBase<ConvolutionContext>
{
LegacyPerformanceConfig GetPerformanceConfig(const ConvolutionContext&) const;
bool IsValidPerformanceConfig(const ConvolutionContext&, const LegacyPerformanceConfig&) const
{
return true; // Do not check by default.
}
LegacyPerformanceConfig Search(const ConvolutionContext&) const;
};

Expand All @@ -571,13 +567,18 @@ struct ConvOclDirectFwd : ConvOclDirectFwdLegacyExhaustiveSearch

ConvSolution GetSolution(const ConvolutionContext& params,
const LegacyPerformanceConfig& searched_params) const;
bool IsValidPerformanceConfig(const ConvolutionContext&, const LegacyPerformanceConfig&) const;
};

// OpenCL direct-convolution solver specialized for 1x1 filters.
// Inherits the legacy exhaustive-search machinery (GetPerformanceConfig /
// Search) from ConvOclDirectFwdLegacyExhaustiveSearch.
struct ConvOclDirectFwd1x1 : ConvOclDirectFwdLegacyExhaustiveSearch
{
    // True when the problem described by params can be handled by this solver.
    bool IsApplicable(const ConvolutionContext& params) const;
    // Builds the kernel/solution from a previously searched performance config.
    ConvSolution GetSolution(const ConvolutionContext& params,
                             const LegacyPerformanceConfig& searched_params) const;
    // Accepts any config unconditionally — no validation is performed for
    // the 1x1 case.
    bool IsValidPerformanceConfig(const ConvolutionContext&, const LegacyPerformanceConfig&) const
    {
        return true;
    }
};

struct ConvBinWinograd3x3U : SolverBase<ConvolutionContext>
Expand Down
17 changes: 17 additions & 0 deletions src/include/miopen/tensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ template <std::size_t N, class T, class U>
auto tien(T&& x, U y)
MIOPEN_RETURNS(tie_impl(std::forward<T>(x), y, typename detail::gens<N>::type{}));

template <typename F, std::size_t... Ns>
auto create_tuple_impl(F f, detail::seq<Ns...>)
{
return std::make_tuple(std::forward<decltype(f(Ns))>(f(Ns))...);
}

template <std::size_t N, typename F>
auto create_tuple(F f)
{
return create_tuple_impl(f, typename detail::gens<N>::type{});
}

inline std::size_t GetTypeSize(miopenDataType_t d)
{
switch(d)
Expand All @@ -79,6 +91,11 @@ struct TensorDescriptor : miopenTensorDescriptor
std::initializer_list<std::size_t> pstrides);
TensorDescriptor(miopenDataType_t t, const int* plens, int size);
TensorDescriptor(miopenDataType_t t, const int* plens, const int* pstrides, int size);

TensorDescriptor(miopenDataType_t t,
std::vector<std::size_t> lens_in,
std::vector<std::size_t> strides_in);

template <class Range>
TensorDescriptor(miopenDataType_t t, const Range& plens)
: lens(plens.begin(), plens.end()), packed(true), type(t)
Expand Down
101 changes: 101 additions & 0 deletions src/include/miopen/tensor_ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,111 @@
#include <miopen/handle.hpp>
#include <miopen/miopen.h>
#include <miopen/object.hpp>
#include <miopen/functional.hpp>
#include <vector>
#include <boost/range/combine.hpp>
#include <boost/range/adaptor/filtered.hpp>

namespace miopen {

// Predicate for boost::adaptors::filtered: keeps combined (length, strides...)
// tuples whose first element — a dimension length — is greater than 1, i.e.
// drops degenerate dimensions when flattening tensor descriptors.
struct f_length_is_not_1_t
{
    // const-qualified: range adaptors store a copy of the predicate and are
    // free to invoke it through a const reference; a non-const call operator
    // would fail to compile in that case.
    template <typename T>
    bool operator()(T&& v) const
    {
        return boost::get<0>(v) > 1;
    }
};

TensorDescriptor GetFlattenedTensorDescriptor(const TensorDescriptor& desc);

/// Flatten a pack of tensor descriptors that share identical lengths (but may
/// have different strides) into descriptors of the lowest possible rank that
/// still describe the same memory layouts. A run of adjacent dimensions is
/// merged only when it is contiguous (stride ratio equals the length) in
/// EVERY input tensor, so all returned descriptors end up with the same
/// (flattened) lengths. Returns one flattened TensorDescriptor per input,
/// in the same order, as a std::tuple.
/// NOTE(review): if every dimension has length <= 1, non1_length_strides is
/// empty and the first dereference below is past-the-end — presumably callers
/// never pass such descriptors; confirm.
template <typename... TDescriptors>
std::tuple<TDescriptors...>
GetConsistentFlattenedTensorDescriptors(const TDescriptors&... real_descriptor_pack)
{
    constexpr std::size_t NTensor = sizeof...(TDescriptors);
    // Compile-time tensor count, passed to repeat_n so per-tensor work can be
    // unrolled with constexpr indices (needed for boost::get<I> below).
    std::integral_constant<std::size_t, NTensor> NTensorConstant;

    // Pointer view of the pack, for runtime indexing.
    std::array<const TensorDescriptor*, NTensor> real_descriptors{{(&real_descriptor_pack)...}};

#ifndef NDEBUG
    // sanity check: all input TensorDescriptors should have the same GetLengths()
    const auto& real_desc_0_lens = real_descriptors[0]->GetLengths();

    for(std::size_t itensor = 1; itensor < NTensor; ++itensor)
    {
        if(real_desc_0_lens != real_descriptors[itensor]->GetLengths())
            MIOPEN_THROW(miopenStatusBadParm, "Lengths of Tensors are different.");
    }
#endif

    // start flattening tensors
    std::array<std::vector<std::size_t>, NTensor> array_of_flat_lengths;
    std::array<std::vector<std::size_t>, NTensor> array_of_flat_strides;

    // Zip each dimension's length with its stride in every tensor, skipping
    // length-1 dimensions: they carry no layout information.
    auto non1_length_strides =
        boost::combine(real_descriptors[0]->GetLengths(), real_descriptor_pack.GetStrides()...) |
        boost::adaptors::filtered(f_length_is_not_1_t());

    auto i = non1_length_strides.begin();
    // Running length of the dimension group currently being merged.
    std::size_t flat_len = boost::get<0>(*i);
    auto i_previous = i++;

    // the 0-th dimension full-length doesn't matter
    for(; i != non1_length_strides.end(); ++i)
    {
        std::size_t len = boost::get<0>(*i);

        // A dimension can be merged into the previous one only if, in every
        // tensor, previous_stride / stride == len (i.e. no padding between
        // the two dimensions in that tensor's layout).
        bool is_all_full_length = true;
        repeat_n(
            [&](auto itensor) {
                // Tuple slot 0 is the length; slot itensor+1 is that tensor's stride.
                std::size_t stride = boost::get<itensor + 1>(*i);
                std::size_t previous_stride = boost::get<itensor + 1>(*i_previous);
                std::size_t full_len = previous_stride / stride;
                if(len != full_len)
                    is_all_full_length = false;
            },
            NTensorConstant);

        if(is_all_full_length)
        {
            // Contiguous in all tensors: fold this dimension into the group.
            flat_len *= len;
        }
        else
        {
            // Close the current group: record its merged length once (lengths
            // are shared) and its stride per tensor, then start a new group.
            array_of_flat_lengths[0].push_back(flat_len);

            repeat_n(
                [&](auto itensor) {
                    std::size_t previous_stride = boost::get<itensor + 1>(*i_previous);
                    array_of_flat_strides[itensor].push_back(previous_stride);
                },
                NTensorConstant);
            flat_len = len;
        }
        i_previous = i;
    }
    // lengths of all flattened tensors are the same
    array_of_flat_lengths[0].push_back(flat_len);

    // strides of all flattened tensors are different
    repeat_n(
        [&](auto itensor) {
            std::size_t previous_stride = boost::get<itensor + 1>(*i_previous);
            array_of_flat_strides[itensor].push_back(previous_stride);
        },
        NTensorConstant);

    // Replicate the shared length vector to every tensor's slot.
    for(std::size_t itensor = 1; itensor < NTensor; ++itensor)
        array_of_flat_lengths[itensor] = array_of_flat_lengths[0];

    // Build one flattened descriptor per input, preserving each element type.
    return create_tuple<NTensor>([&](auto itensor) {
        return TensorDescriptor{real_descriptors[itensor]->GetType(),
                                std::move(array_of_flat_lengths[itensor]),
                                std::move(array_of_flat_strides[itensor])};
    });
}

void ScaleTensor(
Handle& handle, const TensorDescriptor& yDesc, Data_t y, const void* alpha, int offset = 0);

Expand Down
32 changes: 4 additions & 28 deletions src/ocl/convolutionocl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,14 +330,6 @@ void ConvolutionDescriptor::FindConvFwdAlgorithm(Handle& handle,

std::tie(std::ignore, std::ignore, wei_h, wei_w) = tien<4>(wDesc.GetLengths());

/// \todo add support for non-zero padding in 1x1conv
if((wei_h == 1 && wei_w == 1) && (pad_h > 0 || pad_w > 0))
{
MIOPEN_THROW(miopenStatusBadParm,
"Invalid config. MIOPEN expects padding "
"== 0 when filter size == 1");
}

int out_h, out_w;
std::tie(std::ignore, std::ignore, out_h, out_w) = tien<4>(yDesc.GetLengths());

Expand Down Expand Up @@ -1050,7 +1042,7 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle,
for(int i = 0; i < in_n; i++)
{
int out_offset = i * wei_n * out_h * out_w;
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1)
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1 || pad_h != 0 || pad_w != 0)
{
size_t in_offset = i * in_c * in_h * in_w;
Im2ColGPU(handle,
Expand Down Expand Up @@ -1410,14 +1402,6 @@ void ConvolutionDescriptor::FindConvBwdDataAlgorithm(Handle& handle,

std::tie(std::ignore, std::ignore, wei_h, wei_w) = tien<4>(wDesc.GetLengths());

/// \todo add support for non-zero padding in 1x1conv
if((wei_h == 1 && wei_w == 1) && (pad_h > 0 || pad_w > 0))
{
MIOPEN_THROW(miopenStatusBadParm,
"Invalid config. MIOPEN expects padding "
"== 0 when filter size == 1");
}

int out_h, out_w;
std::tie(std::ignore, std::ignore, out_h, out_w) = tien<4>(dyDesc.GetLengths());

Expand Down Expand Up @@ -2176,7 +2160,7 @@ void ConvolutionDescriptor::ConvolutionBackwardData(Handle& handle,
{
int out_offset = i * wei_n * out_h * out_w;

if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1)
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1 || pad_h != 0 || pad_w != 0)
{
size_t in_offset = i * in_c * in_h * in_w;

Expand Down Expand Up @@ -2625,14 +2609,6 @@ void ConvolutionDescriptor::FindConvBwdWeightsAlgorithm(Handle& handle,

std::tie(std::ignore, std::ignore, wei_h, wei_w) = tien<4>(dwDesc.GetLengths());

/// \todo add support for non-zero padding in 1x1conv
if((wei_h == 1 && wei_w == 1) && (pad_h > 0 || pad_w > 0))
{
MIOPEN_THROW(miopenStatusBadParm,
"Invalid config. MIOPEN expects padding "
"== 0 when filter size == 1");
}

int out_h, out_w;
std::tie(std::ignore, std::ignore, out_h, out_w) = tien<4>(dyDesc.GetLengths());

Expand Down Expand Up @@ -2963,7 +2939,7 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(Handle& handle,

std::string network_config;

if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1)
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1 || pad_h != 0 || pad_w != 0)
{
assert(workSpace != nullptr &&
workSpaceSize >=
Expand All @@ -2980,7 +2956,7 @@ void ConvolutionDescriptor::ConvolutionBackwardWeights(Handle& handle,
for(int i = 0; i < in_n; i++)
{
int out_offset = i * wei_n * out_h * out_w;
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1)
if(wei_h != 1 || wei_w != 1 || v != 1 || u != 1 || pad_h != 0 || pad_w != 0)
{
size_t in_offset = i * in_c * in_h * in_w;
Im2ColGPU(handle,
Expand Down
Loading

0 comments on commit db83eda

Please sign in to comment.