[PHI] traspose2 kernel migration (#47748)

* traspose2 kernel migrated * Got rid of mutable_data * x modification added * ops added in extra info file * Formatting fix * 2 fuse passes with tanpose2 commented * nr of outs changed in 2 passes, passes uncommented * Changes in passes reverted * transpose chnaged in operator.cc * MKLDNN check in operator.cc * Transpose fixes * Fix deleted from operato * template corrected Co-authored-by: Paulina Gacek <[email protected]>
PaddlePaddle · Nov 29, 2022 · d86aa4c · d86aa4c
1 parent 91dd8a2
commit d86aa4c
Show file tree

Hide file tree

Showing 6 changed files with 226 additions and 61 deletions.
diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -42,9 +42,6 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 
     auto& astream = OneDNNContext::tls().get_stream();
 
-    platform::SetInMemDescWithLogicalLayoutFusesSupport(
-        ctx, const_cast<phi::DenseTensor*>(x), x->mem_desc());
-
     if (ndims == 1) {
       framework::TensorCopy(*x, x->place(), out);
       out->set_mem_desc(x->mem_desc());
@@ -82,11 +79,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
     astream.wait();
 
-    platform::SetOutMemDescWithLogicalLayoutFusesSupport(
-        ctx,
-        out,
-        reorder_dst_memory_p->get_desc().permute_axes(
-            TransposeToPermuteAxis(transpose_axis)));
+    out->set_mem_desc(reorder_dst_memory_p->get_desc().permute_axes(
+        TransposeToPermuteAxis(transpose_axis)));
   }
 
  private:
@@ -180,11 +174,3 @@ REGISTER_OP_KERNEL(transpose_grad,
                    MKLDNN,
                    ::paddle::platform::CPUPlace,
                    ops::TransposeMKLDNNGradOpKernel<float>);
-
-REGISTER_OP_KERNEL(transpose2,
-                   MKLDNN,
-                   ::paddle::platform::CPUPlace,
-                   ops::TransposeMKLDNNOpKernel<float>,
-                   ops::TransposeMKLDNNOpKernel<uint8_t>,
-                   ops::TransposeMKLDNNOpKernel<int8_t>,
-                   ops::TransposeMKLDNNOpKernel<paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h
@@ -120,6 +120,9 @@ const std::unordered_map<std::string, ExtraAttrPropertySet>
         {"Scale_weights", ExtraAttrProperty::ONEDNN},
         {"x_data_format", ExtraAttrProperty::ONEDNN},
         {"y_data_format", ExtraAttrProperty::ONEDNN},
+        {"fused_squeeze2_axes", ExtraAttrProperty::ONEDNN},
+        {"fused_unsqueeze2_axes", ExtraAttrProperty::ONEDNN},
+        {"fused_reshape2_shape", ExtraAttrProperty::ONEDNN},
         // ONEDNN pass dedicated attributes
         {"Activation_scale", ExtraAttrProperty::ONEDNN},
         {"Bias_scales", ExtraAttrProperty::ONEDNN},

diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
@@ -151,50 +151,6 @@ static void SetOutMemDescWithLogicalLayoutFusesSupport(
   }
 }
 
-static void SetInMemDescWithSqueeze2FuseSupport(
-    const framework::ExecutionContext& ctx,
-    phi::DenseTensor* in,
-    const dnnl::memory::desc& in_md) {
-  const std::vector<int> fused_squeeze2_axes =
-      ctx.Attr<std::vector<int>>("fused_squeeze2_axes");
-  const std::set<int64_t> squeeze2_axes_set(fused_squeeze2_axes.begin(),
-                                            fused_squeeze2_axes.end());
-  const std::vector<int64_t>& x_vec_dims = in_md.dims();
-  std::vector<int64_t> squeezed_op_tz(
-      x_vec_dims.size() - fused_squeeze2_axes.size(), 0);
-
-  int j = 0;
-  for (size_t i = 0; i < x_vec_dims.size(); ++i) {
-    if (squeeze2_axes_set.count(i) ||
-        squeeze2_axes_set.count(i - x_vec_dims.size())) {
-      PADDLE_ENFORCE_EQ(
-          x_vec_dims[i],
-          1,
-          platform::errors::InvalidArgument(
-              "Squeeze2 input dim %d should be equal to one, but get %d.",
-              i,
-              x_vec_dims[i]));
-      continue;
-    }
-    squeezed_op_tz[j++] = x_vec_dims[i];
-  }
-
-  in->set_mem_desc(in_md.reshape(squeezed_op_tz));
-  in->Resize(phi::make_ddim(squeezed_op_tz));
-}
-
-static void SetInMemDescWithLogicalLayoutFusesSupport(
-    const framework::ExecutionContext& ctx,
-    phi::DenseTensor* in,
-    const dnnl::memory::desc& in_md) {
-  if (ctx.HasAttr("fused_squeeze2_axes")) {
-    SetInMemDescWithSqueeze2FuseSupport(ctx, in, in_md);
-  } else {
-    in->set_mem_desc(in_md);
-    in->Resize(phi::make_ddim(in_md.dims()));
-  }
-}
-
 template <typename XT, typename YT, typename OT>
 class MatMulV2MKLDNNHandler
     : public phi::funcs::OneDNNHandlerNoCachingT<XT, dnnl::matmul> {

diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h
@@ -15,6 +15,7 @@ limitations under the License. */
 
 #include <algorithm>
 #include <memory>
+#include <set>
 #include <sstream>
 #include <string>
 #include <utility>
@@ -1660,6 +1661,85 @@ class PoolingOneDNNHandler
   }
 };
 
+static void SetOutMemDescWithUnsqueeze2FuseSupport(
+    const std::vector<int> fused_unsqueeze2_axes,
+    phi::DenseTensor* out,
+    const dnnl::memory::desc& out_md) {
+  const std::vector<int64_t>& op_tz = out_md.dims();
+  std::vector<int64_t> unsqueezed_op_tz(
+      op_tz.size() + fused_unsqueeze2_axes.size(), 0);
+
+  for (const auto& axis : fused_unsqueeze2_axes) {
+    int positive_axis = axis < 0 ? unsqueezed_op_tz.size() + axis : axis;
+    unsqueezed_op_tz[positive_axis] = 1;
+  }
+
+  int j = 0;
+  for (size_t i = 0; i < unsqueezed_op_tz.size(); ++i) {
+    if (unsqueezed_op_tz[i] == 0) {
+      unsqueezed_op_tz[i] = op_tz[j++];
+    }
+  }
+  out->set_mem_desc(out_md.reshape(unsqueezed_op_tz));
+  out->Resize(make_ddim(unsqueezed_op_tz));
+}
+
+static void SetOutMemDescWithReshape2FuseSupport(
+    const std::vector<int> fused_reshape2_shape_,
+    phi::DenseTensor* out,
+    const dnnl::memory::desc& out_md) {
+  std::vector<int64_t> fused_reshape2_shape(fused_reshape2_shape_.begin(),
+                                            fused_reshape2_shape_.end());
+
+  const int out_shape_numel = out->numel();
+  const int new_shape_numel = std::accumulate(fused_reshape2_shape.begin(),
+                                              fused_reshape2_shape.end(),
+                                              1,
+                                              std::multiplies<int64_t>());
+
+  for (size_t i = 0; i < fused_reshape2_shape.size(); ++i) {
+    if (fused_reshape2_shape[i] == -1) {
+      fused_reshape2_shape[i] = -out_shape_numel / new_shape_numel;
+      break;
+    }
+  }
+
+  out->set_mem_desc(out_md.reshape(fused_reshape2_shape));
+  out->Resize(phi::make_ddim(fused_reshape2_shape));
+}
+
+static void SetOutMemDescWithLogicalLayoutFusesSupport(
+    const OneDNNContext& dev_ctx,
+    phi::DenseTensor* out,
+    const dnnl::memory::desc& out_md) {
+  const auto fused_unsqueeze2_axes =
+      dev_ctx.HasDnnAttr("fused_unsqueeze2_axes")
+          ? PADDLE_GET_CONST(std::vector<int>,
+                             dev_ctx.GetDnnAttr("fused_unsqueeze2_axes"))
+          : std::vector<int>();
+  const auto fused_reshape2_shape =
+      dev_ctx.HasDnnAttr("fused_reshape2_shape")
+          ? PADDLE_GET_CONST(std::vector<int>,
+                             dev_ctx.GetDnnAttr("fused_reshape2_shape"))
+          : std::vector<int>();
+  const auto fused_squeeze2_axes =
+      dev_ctx.HasDnnAttr("fused_squeeze2_axes")
+          ? PADDLE_GET_CONST(std::vector<int>,
+                             dev_ctx.GetDnnAttr("fused_squeeze2_axes"))
+          : std::vector<int>();
+
+  if (!fused_unsqueeze2_axes.empty()) {
+    SetOutMemDescWithUnsqueeze2FuseSupport(fused_unsqueeze2_axes, out, out_md);
+  } else if (!fused_reshape2_shape.empty()) {
+    SetOutMemDescWithReshape2FuseSupport(fused_reshape2_shape, out, out_md);
+  } else if (!fused_squeeze2_axes.empty()) {
+    out->set_mem_desc(out_md);
+    out->Resize(make_ddim(out_md.dims()));
+  } else {
+    out->set_mem_desc(out_md);
+  }
+}
+
 static DDim RowMatrixDimsFromVector(const DDim& x_dim) {
   return x_dim.size() > 1 ? x_dim : make_ddim({1, x_dim[0]});
 }

diff --git a/paddle/phi/kernels/onednn/transpose_grad_kernel.cc b/paddle/phi/kernels/onednn/transpose_grad_kernel.cc
@@ -63,4 +63,4 @@ void TransposeGradKernel(const Context& dev_ctx,
 }  // namespace phi
 
 PD_REGISTER_KERNEL(
-    transpose_grad, OneDNN, ALL_LAYOUT, phi::TransposeGradKernel, float) {}
+    transpose_grad, OneDNN, ONEDNN, phi::TransposeGradKernel, float) {}
diff --git a/paddle/phi/kernels/onednn/transpose_kernel.cc b/paddle/phi/kernels/onednn/transpose_kernel.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/transpose_kernel.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+void SetInMemDescWithSqueeze2FuseSupport(
+    const std::vector<int> fused_squeeze2_axes,
+    DenseTensor* in,
+    const dnnl::memory::desc& in_md) {
+  const std::set<int64_t> squeeze2_axes_set(fused_squeeze2_axes.begin(),
+                                            fused_squeeze2_axes.end());
+  const std::vector<int64_t>& x_vec_dims = in_md.dims();
+  std::vector<int64_t> squeezed_op_tz(
+      x_vec_dims.size() - fused_squeeze2_axes.size(), 0);
+
+  int j = 0;
+  for (size_t i = 0; i < x_vec_dims.size(); ++i) {
+    if (squeeze2_axes_set.count(i) ||
+        squeeze2_axes_set.count(i - x_vec_dims.size())) {
+      PADDLE_ENFORCE_EQ(
+          x_vec_dims[i],
+          1,
+          errors::InvalidArgument(
+              "Squeeze2 input dim %d should be equal to one, but get %d.",
+              i,
+              x_vec_dims[i]));
+      continue;
+    }
+    squeezed_op_tz[j++] = x_vec_dims[i];
+  }
+
+  in->set_mem_desc(in_md.reshape(squeezed_op_tz));
+  in->Resize(make_ddim(squeezed_op_tz));
+}
+
+void SetInMemDescWithLogicalLayoutFusesSupport(
+    const OneDNNContext& dev_ctx,
+    DenseTensor* in,
+    const dnnl::memory::desc& in_md) {
+  const auto fused_squeeze2_axes =
+      dev_ctx.HasDnnAttr("fused_squeeze2_axes")
+          ? PADDLE_GET_CONST(std::vector<int>,
+                             dev_ctx.GetDnnAttr("fused_squeeze2_axes"))
+          : std::vector<int>();
+  if (fused_squeeze2_axes.empty()) {
+    in->set_mem_desc(in_md);
+    in->Resize(make_ddim(in_md.dims()));
+  } else {
+    SetInMemDescWithSqueeze2FuseSupport(fused_squeeze2_axes, in, in_md);
+  }
+}
+
+template <typename T, typename Context>
+void TransposeKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const std::vector<int>& axis,
+                     DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      dev_ctx.GetPlace().GetType() == AllocationType::CPU,
+      true,
+      errors::PreconditionNotMet("oneDNN Transpose kernel must use CPUPlace"));
+
+  SetInMemDescWithLogicalLayoutFusesSupport(
+      dev_ctx, const_cast<DenseTensor*>(&x), x.mem_desc());
+
+  if (axis.size() == 1) {
+    paddle::framework::TensorCopy(x, x.place(), out);
+    out->set_mem_desc(x.mem_desc());
+    return;
+  }
+
+  auto x_vec_dims = vectorize(x.dims());
+  auto x_type = funcs::ToOneDNNDataType(x.dtype());
+  funcs::ReorderOneDNNHandler reorder_handler(
+      x_vec_dims, x.dtype(), x_type, dev_ctx.GetEngine());
+  auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
+      x.mem_desc(), funcs::to_void_cast(x.data<T>()));
+  auto dst_md =
+      dnnl::memory::desc(x_vec_dims,
+                         x.mem_desc().data_type(),
+                         funcs::GetPlainOneDNNFormat(x_vec_dims.size()));
+
+  // a trick is used here to fake transpose of out_md, so later it will be
+  // "untransposed", leaving output data in plain format tag
+  std::vector<int64_t> fake_strides(axis.size());
+  auto dims = dst_md.dims();
+  int total_stride = 1;
+  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
+    fake_strides[axis[i]] = total_stride;
+    total_stride *= dims[axis[i]];
+  }
+  dst_md =
+      dnnl::memory::desc(x_vec_dims, x.mem_desc().data_type(), fake_strides);
+  auto dst_data = dev_ctx.template Alloc<T>(out);
+  auto reorder_dst_memory_p =
+      std::make_shared<dnnl::memory>(dst_md, dev_ctx.GetEngine(), dst_data);
+  auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p,
+                                                  reorder_src_memory_p);
+
+  auto& astream = OneDNNContext::tls().get_stream();
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+
+  // it is needed because oneDNN's permute axis understand axes order in
+  // different way PaddlePaddle's transpose
+  std::vector<int> permute_axis(axis.size());
+  for (size_t i = 0; i < axis.size(); ++i) {
+    permute_axis[axis[i]] = i;
+  }
+  funcs::SetOutMemDescWithLogicalLayoutFusesSupport(
+      dev_ctx,
+      out,
+      reorder_dst_memory_p->get_desc().permute_axes(permute_axis));
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(transpose,
+                   OneDNN,
+                   ONEDNN,
+                   phi::TransposeKernel,
+                   float,
+                   uint8_t,
+                   int8_t,
+                   phi::dtype::bfloat16) {}