Skip to content

Commit

Permalink
[API] NCHWc tensor layout support (#1532)
Browse files Browse the repository at this point in the history
  • Loading branch information
aska-0096 authored May 27, 2022
1 parent f3b4f18 commit 93d1476
Show file tree
Hide file tree
Showing 24 changed files with 920 additions and 108 deletions.
88 changes: 62 additions & 26 deletions driver/conv_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ class ConvDriver : public Driver
// defined in MIOpen lib.
// layout_type - input value supplied with MIOpen driver command.
void ValidateLayoutInputParameters(std::string layout_type);
void ValidateVectorizedParameters(int vector_dim, int vector_length);

// Helper function to check the Layout type short names
// Short names are defined as I,O,f. W.r.t In/Out/fil layout
Expand Down Expand Up @@ -456,6 +457,7 @@ int ConvDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
{
std::string in_layoutValue = inflags.GetValueStr("in_layout");
ValidateLayoutInputParameters(in_layoutValue);
inflags.SetValue("in_layout", in_layoutValue);
}
// fil layout argument value check
if(inflags.GetValueStr("fil_layout").empty())
Expand All @@ -466,6 +468,7 @@ int ConvDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
{
std::string fil_layoutValue = inflags.GetValueStr("fil_layout");
ValidateLayoutInputParameters(fil_layoutValue);
inflags.SetValue("fil_layout", fil_layoutValue);
}
// out layout argument check
if(inflags.GetValueStr("out_layout").empty())
Expand All @@ -476,7 +479,24 @@ int ConvDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
{
std::string out_layoutValue = inflags.GetValueStr("out_layout");
ValidateLayoutInputParameters(out_layoutValue);
inflags.SetValue("out_layout", out_layoutValue);
}

// vectorized tensor Dimension & Length check
int vector_dim = inflags.GetValueInt("tensor_vect");
int vector_length = inflags.GetValueInt("vector_length");

ValidateVectorizedParameters(vector_dim, vector_length);
if(vector_length != 1 && vector_dim == 1)
{
inflags.SetValue("in_layout",
inflags.GetValueStr("in_layout") + "c" + std::to_string(vector_length));
inflags.SetValue("fil_layout",
inflags.GetValueStr("fil_layout") + "c" + std::to_string(vector_length));
inflags.SetValue("out_layout",
inflags.GetValueStr("out_layout") + "c" + std::to_string(vector_length));
}

num_iterations = inflags.GetValueInt("iter");
if(num_iterations < 1)
{
Expand Down Expand Up @@ -538,7 +558,8 @@ void ConvDriver<Tgpu, Tref>::ValidateLayoutInputParameters(std::string layout_va
else
{
if((layout_value.compare("NCHW") == 0) || (layout_value.compare("NHWC") == 0) ||
(layout_value.compare("NCDHW") == 0) || (layout_value.compare("NDHWC") == 0))
(layout_value.compare("CHWN") == 0) || (layout_value.compare("NCDHW") == 0) ||
(layout_value.compare("NDHWC") == 0))
{
// do nothing,Values are matching as defined in Lib.
}
Expand All @@ -550,6 +571,22 @@ void ConvDriver<Tgpu, Tref>::ValidateLayoutInputParameters(std::string layout_va
}
}

template <typename Tgpu, typename Tref>
void ConvDriver<Tgpu, Tref>::ValidateVectorizedParameters(int vector_dim, int vector_length)
{
    // Validate the (tensor_vect, vector_length) flag combination.
    // Accepted combinations:
    //   - vectorized C dimension (vector_dim == 1) with length 4 or 8
    //     (i.e. the NCHWc4/NCHWc8/CHWNc4/CHWNc8 layouts);
    //   - non-vectorized tensors (vector_dim == 0) with length 0 or 1.
    //     The 'vector_length' flag defaults to "1", so the default driver
    //     invocation (dim 0, length 1) must be accepted here.
    if(((vector_length == 4 || vector_length == 8) && vector_dim == 1) ||
       ((vector_length == 0 || vector_length == 1) && vector_dim == 0))
    {
        // do nothing, values match the combinations defined in the MIOpen lib.
    }
    else
    {
        // Separators added so the two values do not run together in the log.
        std::cerr << "Invalid Tensor Vectorization Parameter Value - "
                  << "vector_dim: " << vector_dim << ", vector_length: " << vector_length
                  << std::endl;
        exit(EXIT_FAILURE);
    }
}

template <typename Tgpu, typename Tref>
int ConvDriver<Tgpu, Tref>::ChkLayout_ShortName()
{
Expand Down Expand Up @@ -594,7 +631,16 @@ int ConvDriver<Tgpu, Tref>::GetandSetData()
SetConvDescriptorFromCmdLineArgs();

std::vector<int> out_len = GetOutputTensorLengths();

if(miopen::deref(inputTensor).GetLayout_t() == miopenTensorNCHWc4 ||
miopen::deref(inputTensor).GetLayout_t() == miopenTensorNCHWc8)
{
out_len[1] *= miopen::deref(inputTensor).GetVectorLength();
}
if(miopen::deref(inputTensor).GetLayout_t() == miopenTensorCHWNc4 ||
miopen::deref(inputTensor).GetLayout_t() == miopenTensorCHWNc8)
{
out_len[0] *= miopen::deref(inputTensor).GetVectorLength();
}
miopenDataType_t y_type =
(data_type == miopenInt8 || data_type == miopenInt8x4) ? miopenInt32 : data_type;
SetTensorNd(outputTensor, out_len, inflags.GetValueStr("out_layout"), y_type);
Expand Down Expand Up @@ -737,6 +783,8 @@ int ConvDriver<Tgpu, Tref>::AddCmdLineArgs()
"0",
"tensor vectorization type (none, vect_c, vect_n) (Default=0)",
"int");
inflags.AddInputFlag(
"vector_length", 'L', "1", "tensor vectorization length (Default=1)", "int");
inflags.AddInputFlag("dilation_d", '^', "1", "Dilation of Filter Depth (Default=1)", "int");
inflags.AddInputFlag("dilation_h", 'l', "1", "Dilation of Filter Height (Default=1)", "int");
inflags.AddInputFlag("dilation_w", 'j', "1", "Dilation of Filter Width (Default=1)", "int");
Expand Down Expand Up @@ -1203,17 +1251,13 @@ int ConvDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
}

if(is_fwd || is_wrw)
in = tensor<Tgpu>(miopen::deref(inputTensor).GetLengths(),
miopen::deref(inputTensor).GetStrides());
in = tensor<Tgpu>(miopen::deref(inputTensor));
if(is_fwd || is_bwd)
wei = tensor<Tgpu>(miopen::deref(weightTensor).GetLengths(),
miopen::deref(weightTensor).GetStrides());
wei = tensor<Tgpu>(miopen::deref(weightTensor));
if(is_fwd)
out = tensor<Tgpu>(miopen::deref(outputTensor).GetLengths(),
miopen::deref(outputTensor).GetStrides());
out = tensor<Tgpu>(miopen::deref(outputTensor));
if(is_bwd || is_wrw)
dout = tensor<Tgpu>(miopen::deref(outputTensor).GetLengths(),
miopen::deref(outputTensor).GetStrides());
dout = tensor<Tgpu>(miopen::deref(outputTensor));

if(is_bwd)
din = std::vector<Tgpu>(in_sz, static_cast<Tgpu>(0));
Expand All @@ -1229,12 +1273,9 @@ int ConvDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
new GPUMem(ctx, GetTensorSize(weightTensor_vect4), sizeof(Tgpu)));
}

outhost = tensor<Tref>(miopen::deref(outputTensor).GetLengths(),
miopen::deref(outputTensor).GetStrides());
din_host = tensor<Tref>(miopen::deref(inputTensor).GetLengths(),
miopen::deref(inputTensor).GetStrides());
dwei_host = tensor<Tref>(miopen::deref(weightTensor).GetLengths(),
miopen::deref(weightTensor).GetStrides());
outhost = tensor<Tref>(miopen::deref(outputTensor));
din_host = tensor<Tref>(miopen::deref(inputTensor));
dwei_host = tensor<Tref>(miopen::deref(weightTensor));

std::string inFileName = inflags.GetValueStr("in_data");
std::string weiFileName = inflags.GetValueStr("weights");
Expand Down Expand Up @@ -1341,11 +1382,9 @@ int ConvDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
size_t b_sz = GetTensorSize(biasTensor);
b_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, b_sz, sizeof(Tgpu)));
db_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, b_sz, sizeof(Tgpu)));
b = tensor<Tgpu>(miopen::deref(biasTensor).GetLengths(),
miopen::deref(biasTensor).GetStrides());
b = tensor<Tgpu>(miopen::deref(biasTensor));
db = std::vector<Tgpu>(b_sz, static_cast<Tgpu>(0));
db_host = tensor<Tref>(miopen::deref(biasTensor).GetLengths(),
miopen::deref(biasTensor).GetStrides());
db_host = tensor<Tref>(miopen::deref(biasTensor));
for(int i = 0; i < b_sz; i++)
{
b.data[i] = static_cast<Tgpu>(i % 8) +
Expand Down Expand Up @@ -2147,8 +2186,7 @@ int ConvDriver<Tgpu, Tref>::RunForwardGPUReference()
out_dev->FromGPU(GetStream(), outhost.data.data());
else
{
auto out_tmp = tensor<Tgpu>(miopen::deref(outputTensor).GetLengths(),
miopen::deref(outputTensor).GetStrides());
auto out_tmp = tensor<Tgpu>(miopen::deref(outputTensor));
out_dev->FromGPU(GetStream(), out_tmp.data.data());
for(int i = 0; i < out_tmp.data.size(); i++)
{
Expand Down Expand Up @@ -3064,8 +3102,7 @@ int ConvDriver<Tgpu, Tref>::RunBackwardWeightsGPUReference()
dwei_dev->FromGPU(GetStream(), dwei_host.data.data());
else
{
auto dwei_tmp = tensor<Tgpu>(miopen::deref(weightTensor).GetLengths(),
miopen::deref(weightTensor).GetStrides());
auto dwei_tmp = tensor<Tgpu>(miopen::deref(weightTensor));
dwei_dev->FromGPU(GetStream(), dwei_tmp.data.data());
for(int i = 0; i < dwei_tmp.data.size(); i++)
{
Expand Down Expand Up @@ -3113,8 +3150,7 @@ int ConvDriver<Tgpu, Tref>::RunBackwardDataGPUReference()
din_dev->FromGPU(GetStream(), din_host.data.data());
else
{
auto din_tmp = tensor<Tgpu>(miopen::deref(inputTensor).GetLengths(),
miopen::deref(inputTensor).GetStrides());
auto din_tmp = tensor<Tgpu>(miopen::deref(inputTensor).GetType());
din_dev->FromGPU(GetStream(), din_tmp.data.data());
for(int i = 0; i < din_tmp.data.size(); i++)
{
Expand Down
8 changes: 4 additions & 4 deletions driver/driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,10 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
[[gnu::noreturn]] inline void Usage()
{
printf("Usage: ./driver *base_arg* *other_args*\n");
printf(
"Supported Base Arguments: conv[fp16|int8|bfp16], CBAInfer[fp16], pool[fp16], lrn[fp16], "
"activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm, ctc, dropout[fp16], "
"tensorop[fp16], reduce[fp16,fp64]\n");
printf("Supported Base Arguments: conv[fp16|fp16x4|fp16x8|int8|bfp16], CBAInfer[fp16], "
"pool[fp16], lrn[fp16], "
"activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm, ctc, dropout[fp16], "
"tensorop[fp16], reduce[fp16,fp64]\n");
exit(0); // NOLINT (concurrency-mt-unsafe)
}

Expand Down
83 changes: 80 additions & 3 deletions driver/tensor_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,55 @@
#include <numeric>
#include <vector>

/// Map a driver layout string to the corresponding MIOpen vectorized layout enum.
/// Only the vectorized layouts (NCHWc4/NCHWc8/CHWNc4/CHWNc8) are accepted;
/// any other string throws.
inline miopenTensorLayout_t StringToLayoutType(const std::string& layout)
{
    if(layout == "NCHWc4")
        return miopenTensorNCHWc4;
    if(layout == "NCHWc8")
        return miopenTensorNCHWc8;
    if(layout == "CHWNc4")
        return miopenTensorCHWNc4;
    if(layout == "CHWNc8")
        return miopenTensorCHWNc8;
    MIOPEN_THROW("We only support NCHWc4, NCHWc8, CHWNc4, CHWNc8 vectorized tensor layout.");
    // Unreachable: MIOPEN_THROW raises. Kept only to silence missing-return warnings.
    return miopenTensorNCHW;
}

/// Permute 'lens' in place so that lens[i] becomes the element that was at
/// position indices[i], i.e. out[i] = lens[indices[i]]. Every index must be
/// a valid position in 'lens' (checked by assert in debug builds).
inline void LengthReorder(std::vector<int>& lens, const std::initializer_list<int>& indices)
{
    std::vector<int> out_lens;
    out_lens.reserve(indices.size());
    for(const int index : indices)
    {
        // Cast avoids a signed/unsigned comparison warning in the bounds check.
        assert(0 <= index && static_cast<std::size_t>(index) < lens.size());
        // Plain copy: std::move on an int is a no-op and just obscures intent.
        out_lens.push_back(lens[index]);
    }
    lens = std::move(out_lens);
}

/// Query the vectorization length of a tensor descriptor (e.g. 4 or 8 for the
/// NCHWc4/NCHWc8 layouts). Only 4-D descriptors may carry a vectorized layout;
/// any other rank throws.
/// NOTE(review): non-4D tensors reach this throw when called from GetTensorSize
/// — confirm that is the intended behavior for 1-D/5-D descriptors.
inline int GetTensorVectorLength(miopenTensorDescriptor_t& tensor)
{
    // Initialize defensively: if the C API fails without writing the output,
    // we must not return an indeterminate value.
    int vectorLength = 0;

    int size = 0;
    miopenGetTensorDescriptorSize(tensor, &size);

    if(size == 4)
    {
        miopenGetNdTensorDescriptorVectorLength(tensor, &vectorLength);
        return vectorLength;
    }

    MIOPEN_THROW("We only support 4D layout in vector format");
    // Unreachable: MIOPEN_THROW raises. Kept only to silence missing-return warnings.
    return 0;
}

inline std::vector<int> GetTensorLengths(miopenTensorDescriptor_t& tensor)
{
int n;
Expand Down Expand Up @@ -104,6 +153,27 @@ inline int SetTensor4d(miopenTensorDescriptor_t t,
return miopenSet4dTensorDescriptor(t, data_type, UNPACK_VEC4(len));
}

/// Set an N-D descriptor with one of the vectorized layouts.
/// 'len' arrives in NCHW order (MIOpen's implicit convention); for the CHWNc*
/// layouts it is rotated into CHWN order before the descriptor is written.
/// Any non-vectorized layout throws.
inline int SetTensorNdVector(miopenTensorDescriptor_t t,
                             std::vector<int>& len,
                             miopenTensorLayout_t layout,
                             miopenDataType_t data_type = miopenFloat)
{
    switch(layout)
    {
    case miopenTensorNCHWc4:
    case miopenTensorNCHWc8:
        // Lengths already arrive in NCHW order; nothing to reorder.
        break;
    case miopenTensorCHWNc4:
    case miopenTensorCHWNc8:
        // Rotate NCHW -> CHWN.
        LengthReorder(len, {1, 2, 3, 0});
        break;
    default:
        MIOPEN_THROW("We only support NCHWc4, NCHWc8, CHWNc4, CHWNc8 vectorized tensor layout.");
        return -1;
    }
    return miopenSetNdTensorDescriptorWithLayout(t, data_type, layout, len.data(), len.size());
}

inline int SetTensorNd(miopenTensorDescriptor_t t,
std::vector<int>& len,
miopenDataType_t data_type = miopenFloat)
Expand All @@ -129,11 +199,16 @@ inline int SetTensorNd(miopenTensorDescriptor_t t,
return SetTensorNd(t, len, data_type);
}

if(layout.size() != len.size())
if(layout.size() != len.size() && layout.find("c") == std::string::npos)
{
MIOPEN_THROW("unmatched layout and dimension size");
}

if(layout.find("c") != std::string::npos)
{
return SetTensorNdVector(t, len, StringToLayoutType(layout), data_type);
}

// Dimension lengths vector 'len' comes with a default layout.
std::string len_layout = miopen::tensor_layout_get_default(layout.size());
if(len_layout.empty())
Expand All @@ -155,8 +230,9 @@ inline size_t GetTensorSize(miopenTensorDescriptor_t& tensor)
{
assert(miopen::deref(tensor).IsPacked() &&
"GetTensorSize should not be used on an unpacked tensor.");
const auto len = GetTensorLengths(tensor);
size_t sz = std::accumulate(len.begin(), len.end(), size_t{1}, std::multiplies<size_t>());
const auto len = GetTensorLengths(tensor);
const auto vectorLength = GetTensorVectorLength(tensor);
size_t sz = std::accumulate(len.begin(), len.end(), vectorLength, std::multiplies<size_t>());

return sz;
}
Expand All @@ -168,4 +244,5 @@ inline size_t GetTensorSpace(miopenTensorDescriptor_t& tensor)
{
return miopen::deref(tensor).GetElementSpace();
}

#endif // GUARD_MIOPEN_TENSOR_DRIVER_HPP
34 changes: 34 additions & 0 deletions include/miopen/miopen.h
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,24 @@ typedef enum
miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */
} miopenDataType_t;

/*! @ingroup tensor
 * @enum miopenTensorLayout_t
 * Tensor layouts supported by MIOpen.
 * The miopenTensorCHWNc4 and miopenTensorCHWNc8 layouts are supported only for weight tensors.
 */
typedef enum
{
miopenTensorNCHW = 0, /*!< NCHW memory layout (Fully supported) */
miopenTensorNHWC = 1, /*!< NHWC memory layout (Fully supported) */
miopenTensorCHWN = 2, /*!< CHWN memory layout (Not supported) */
miopenTensorNCHWc4 = 3, /*!< NCHWc4 memory layout (Partially supported) */
miopenTensorNCHWc8 = 4, /*!< NCHWc8 memory layout (Partially supported) */
miopenTensorCHWNc4 = 5, /*!< CHWNc4 memory layout (Partially supported) */
miopenTensorCHWNc8 = 6, /*!< CHWNc8 memory layout (Partially supported) */
miopenTensorNCDHW = 7, /*!< NCDHW memory layout (Fully supported) */
miopenTensorNDHWC = 8, /*!< NDHWC memory layout (Fully supported) */
} miopenTensorLayout_t;

/*! @ingroup pooling
* @enum miopenIndexType_t
* MIOpen index datatypes.
Expand Down Expand Up @@ -580,6 +598,22 @@ MIOPEN_EXPORT miopenStatus_t miopenCreateTensorDescriptor(miopenTensorDescriptor
MIOPEN_EXPORT miopenStatus_t miopenSet4dTensorDescriptor(
miopenTensorDescriptor_t tensorDesc, miopenDataType_t dataType, int n, int c, int h, int w);

/*! @brief Set shape of ND tensor with specific layout
*
* Interface for setting N-D tensor shape. This interface supports the NHWC, NCHW, NCHWc*, and CHWNc* layouts.
* @param tensorDesc Tensor descriptor type (output)
* @param dataType MIOpen datatype (input)
* @param tensorLayout Tensor layout (input)
* @param lens Tensor dimensions (input)
* @param num_lens Tensor dimension size (input)
* @return miopenStatus_t
*/
MIOPEN_EXPORT miopenStatus_t
miopenSetNdTensorDescriptorWithLayout(miopenTensorDescriptor_t tensorDesc,
miopenDataType_t dataType,
miopenTensorLayout_t tensorLayout,
int* lens,
int num_lens);
/*! @brief Set shape and stride of 4D tensor
*
* Interface for setting 4-D tensor shape and stride.
Expand Down
2 changes: 1 addition & 1 deletion src/conv/problem_description.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ void ProblemDescription::HeuristicUpdateLayouts()
{
const std::string labels = tensor_layout_get_default(in_layout.size());

static const std::vector<std::string> supported_layouts = {"NCHW", "NHWC", "NCDHW"};
static const std::vector<std::string> supported_layouts = {"NCHW", "NHWC", "CHWN", "NCDHW"};
for(const std::string& layout : supported_layouts)
{
// Skip layouts that doesn't match dimension sizes
Expand Down
Loading

0 comments on commit 93d1476

Please sign in to comment.