Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into normalize-op
PaddlePaddle · Aug 14, 2020 · 935fdf2 · 935fdf2
2 parents b321656 + ba574c8
commit 935fdf2
Show file tree

Hide file tree

Showing 143 changed files with 2,651 additions and 490 deletions.
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
@@ -13,7 +13,7 @@ function(op_library TARGET)
     set(CUDNN_FILE)
     set(mkldnn_cc_srcs)
     set(MKLDNN_FILE)
-    set(op_common_deps operator op_registry math_function layer)
+    set(op_common_deps operator op_registry math_function layer common_infer_shape_functions)
     set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
@@ -164,23 +164,23 @@ if(WITH_PYTHON)
   if (NOT WIN32)
     add_custom_command(TARGET framework_py_proto POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
       COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
-      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
+      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
       COMMENT "Copy generated python proto into directory paddle/fluid/proto."
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else(NOT WIN32)
     string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
-    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/")
+    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/")
     add_custom_command(TARGET framework_py_proto POST_BUILD
           COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
-	  COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
-	  COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
+	  COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
+	  COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py
           COMMAND copy /Y *.py ${proto_dstpath}
 	  COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
           COMMENT "Copy generated python proto into directory paddle/fluid/proto."
-	  COMMENT "Copy generated python proto into directory paddle/fleet/proto."
+	  COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
           WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif(NOT WIN32)
 endif()

diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_desc.h"
+
 #include <algorithm>
 #include <functional>
 #include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <utility>
+
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_call_stack.h"
@@ -51,6 +53,29 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   std::vector<std::string> Outputs(const std::string &name) const override;
 
+  // Returns the name of the idx-th input slot declared in the proto that is
+  // registered for this op's type. Raises an OutOfRange error when idx is not
+  // smaller than the number of declared inputs.
+  // NOTE(review): proto_ is dereferenced without a null check - confirm that
+  // every op type reaching this call has a registered proto.
+  std::string GetInputNameByIdx(size_t idx) const override {
+    const auto &op_info =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type());
+    const auto &declared_inputs = op_info.proto_->inputs();
+    PADDLE_ENFORCE_LT(idx, declared_inputs.size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_.Type(), idx, declared_inputs.size()));
+    return declared_inputs[idx].name();
+  }
+
+  // Returns the name of the idx-th output slot declared in the proto that is
+  // registered for this op's type. Raises an OutOfRange error when idx is not
+  // smaller than the number of declared outputs.
+  // NOTE(review): proto_ is dereferenced without a null check - confirm that
+  // every op type reaching this call has a registered proto.
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    const auto &op_info =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type());
+    const auto &declared_outputs = op_info.proto_->outputs();
+    PADDLE_ENFORCE_LT(
+        idx, declared_outputs.size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_.Type(), idx, declared_outputs.size()));
+    return declared_outputs[idx].name();
+  }
+
   void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size());

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/operator.h"
+
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 
@@ -20,13 +22,13 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
-#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
@@ -604,6 +606,29 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return op_.Outputs(name);
   }
 
+  // Returns the name of the idx-th input slot declared in the proto that is
+  // registered for this op's type. Raises an OutOfRange error when idx is not
+  // smaller than the number of declared inputs.
+  // NOTE(review): proto_ is dereferenced without a null check - confirm that
+  // every op type reaching this call has a registered proto.
+  std::string GetInputNameByIdx(size_t idx) const override {
+    const auto& op_info =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type());
+    const auto& declared_inputs = op_info.proto_->inputs();
+    PADDLE_ENFORCE_LT(idx, declared_inputs.size(),
+                      platform::errors::OutOfRange(
+                          "The index should be less than the size of inputs of "
+                          "operator %s, but got index is %d and size is %d",
+                          op_.Type(), idx, declared_inputs.size()));
+    return declared_inputs[idx].name();
+  }
+
+  // Returns the name of the idx-th output slot declared in the proto that is
+  // registered for this op's type. Raises an OutOfRange error when idx is not
+  // smaller than the number of declared outputs.
+  // NOTE(review): proto_ is dereferenced without a null check - confirm that
+  // every op type reaching this call has a registered proto.
+  std::string GetOutputNameByIdx(size_t idx) const override {
+    const auto& op_info =
+        paddle::framework::OpInfoMap::Instance().Get(op_.Type());
+    const auto& declared_outputs = op_info.proto_->outputs();
+    PADDLE_ENFORCE_LT(
+        idx, declared_outputs.size(),
+        platform::errors::OutOfRange(
+            "The index should be less than the size of outputs of "
+            "operator %s, but got index is %d and size is %d",
+            op_.Type(), idx, declared_outputs.size()));
+    return declared_outputs[idx].name();
+  }
+
   void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
                 size_t j = 0) override {
     auto in_it = ctx_.inputs.find(in);

diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
@@ -64,9 +64,6 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 
-/// Variables with this suffix are the loaded from pre-train model.
-constexpr char kLoadedVarSuffix[] = "@LOADED";
-
 /// RuntimeContext is used to relate input/output names of Operator with
 /// the corresponding variables in name scope.
 /// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same

diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -52,7 +53,8 @@ class InferShapeContext {
                              const std::vector<DDim> &dims) = 0;
   virtual void SetReaderDims(const std::string &name,
                              const std::vector<DDim> &dims);
-
+  virtual std::string GetInputNameByIdx(size_t idx) const = 0;
+  virtual std::string GetOutputNameByIdx(size_t idx) const = 0;
   virtual AttrReader Attrs() const = 0;
   virtual std::vector<std::string> Inputs(const std::string &name) const = 0;
   virtual std::vector<std::string> Outputs(const std::string &name) const = 0;

diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,10 +2,10 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags)
 
 cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function) 
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
 add_subdirectory(jit)
-
-cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer)
+cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
+cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp)
 cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
 cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
 cc_library(imperative_profiler SRCS profiler.cc)

diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/amp_auto_cast.h"
+
+#include <algorithm>
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/fluid/imperative/variable_wrapper.h"
+
+namespace paddle {
+namespace imperative {
+
+// Creates the allow-list and block-list op-name sets, both initially empty.
+AmpOperators::AmpOperators()
+    : allow_ops_(std::make_shared<std::unordered_set<std::string>>()),
+      block_ops_(std::make_shared<std::unordered_set<std::string>>()) {}
+
+// No manual cleanup is performed here.
+AmpOperators::~AmpOperators() = default;
+
+// Returns the single shared AmpOperators object. It is a function-local
+// static, so it is constructed on the first call and the same instance is
+// returned by every later call.
+AmpOperators& AmpOperators::Instance() {
+  static AmpOperators instance;
+  return instance;
+}
+
+// Returns the set of op type names stored in allow_ops_. AutoCastInputs casts
+// the inputs of ops found in this set to FP16. The pointee is not const, so
+// holders of the returned pointer can modify the set.
+std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetAllowOps() {
+  return allow_ops_;
+}
+
+// Returns the set of op type names stored in block_ops_. AutoCastInputs casts
+// the inputs of ops found in this set to FP32. The pointee is not const, so
+// holders of the returned pointer can modify the set.
+std::shared_ptr<std::unordered_set<std::string>> AmpOperators::GetBlockOps() {
+  return block_ops_;
+}
+
+// Returns the textual name of `var`'s data type, as produced by
+// framework::DataTypeToString. `var` must not be null.
+inline std::string GetDtypeStr(
+    const std::shared_ptr<imperative::VarBase>& var) {
+  const auto dtype = var->DataType();
+  return framework::DataTypeToString(dtype);
+}
+
+// Tells whether `var` is a candidate for casting: it must live on a GPU place
+// and hold FP32 or FP16 data. Everything else is left untouched by the
+// CastToFP16 / CastToFP32 helpers.
+inline bool NeedCast(const std::shared_ptr<VarBase>& var) {
+  if (!platform::is_gpu_place(var->Place())) {
+    return false;
+  }
+  const auto dtype = var->DataType();
+  return dtype == framework::proto::VarType::FP32 ||
+         dtype == framework::proto::VarType::FP16;
+}
+
+// Produces a new variable holding `var` converted to `dst_type` by tracing a
+// "cast" op through the current tracer.
+//
+// NOTE: Trace a cast op, so if a var is cast from fp32 to fp16, then the grad
+// var will be cast back from fp16 to fp32 during backward phase.
+//
+// `var` is the variable to convert; its current data type is recorded as the
+// op's "in_dtype" attribute. `dst_type` becomes the "out_dtype" attribute.
+// Returns the freshly created output variable of the cast op.
+static inline std::shared_ptr<imperative::VarBase> CastToType(
+    const std::shared_ptr<VarBase>& var,
+    const framework::proto::VarType::Type dst_type) {
+  const auto& tracer = imperative::GetCurrentTracer();
+  // The cast op reads its source from slot "X".
+  imperative::NameVarBaseMap ins = {{"X", {var}}};
+  framework::AttributeMap attrs = {{"in_dtype", var->DataType()},
+                                   {"out_dtype", dst_type}};
+  // The output variable gets a unique name generated by the tracer.
+  auto out = std::shared_ptr<imperative::VarBase>(
+      new imperative::VarBase(tracer->GenerateUniqueName()));
+  imperative::NameVarBaseMap outs = {{"Out", {out}}};
+
+  {
+    // NOTE(review): presumably switches auto-casting off for the lifetime of
+    // this scope so that tracing the cast op does not itself trigger input
+    // casting - confirm against the AutoCastGuard definition.
+    AutoCastGuard guard(tracer, false);
+    tracer->TraceOp("cast", ins, outs, std::move(attrs));
+  }
+
+  return out;
+}
+
+// Returns `var` converted to FP16. The input is handed back unchanged when
+// NeedCast() rejects it or when it already holds FP16 data.
+static inline std::shared_ptr<imperative::VarBase> CastToFP16(
+    const std::shared_ptr<VarBase>& var) {
+  const auto target = framework::proto::VarType::FP16;
+  if (!NeedCast(var) || var->DataType() == target) {
+    return var;
+  }
+  return CastToType(var, target);
+}
+
+// Returns `var` converted to FP32. The input is handed back unchanged when
+// NeedCast() rejects it or when it already holds FP32 data.
+static inline std::shared_ptr<imperative::VarBase> CastToFP32(
+    const std::shared_ptr<VarBase>& var) {
+  const auto target = framework::proto::VarType::FP32;
+  if (!NeedCast(var) || var->DataType() == target) {
+    return var;
+  }
+  return CastToType(var, target);
+}
+
+// Picks the common data type for the inputs of an op that is in neither the
+// allow list nor the block list: FP32 as soon as any input variable holds
+// FP32 data, FP16 otherwise (including when `ins` has no variables at all).
+static inline framework::proto::VarType::Type GetPromoteType(
+    const NameVarBaseMap& ins) {
+  for (const auto& pair : ins) {
+    for (const auto& var : pair.second) {
+      if (var->DataType() == framework::proto::VarType::FP32) {
+        // One FP32 input settles the answer. Return right away rather than
+        // only leaving the inner loop and scanning the remaining slots.
+        return framework::proto::VarType::FP32;
+      }
+    }
+  }
+  return framework::proto::VarType::FP16;
+}
+
+// Builds the input map that should be fed to op `op_type` under automatic
+// mixed precision.
+//
+//  - op in the allow list: every input is passed through CastToFP16.
+//  - op in the block list: every input is passed through CastToFP32.
+//  - any other op:         inputs are cast to the type chosen by
+//                          GetPromoteType (FP32 if any input is FP32, else
+//                          FP16).
+//
+// `ins` is not modified; a new map is returned. A slot only appears in the
+// result once a variable has been appended to it, so slots whose variable
+// list is empty are absent from the returned map.
+NameVarBaseMap AutoCastInputs(const std::string& op_type,
+                              const NameVarBaseMap& ins) {
+  // Data type name of the first variable of a slot, for log output only.
+  // A slot may hold no variables; dereferencing cbegin() of an empty list is
+  // undefined behaviour, so such slots are reported as "<empty>" instead.
+  const auto first_dtype_str =
+      [](const NameVarBaseMap::mapped_type& vars) -> std::string {
+    if (vars.empty()) {
+      return "<empty>";
+    }
+    return GetDtypeStr(*vars.cbegin());
+  };
+
+  NameVarBaseMap new_ins = {};
+  if (AmpOperators::Instance().GetAllowOps()->count(op_type)) {
+    for (const auto& pair : ins) {
+      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
+              << first_dtype_str(pair.second) << " to float16";
+      for (const auto& var : pair.second) {
+        auto new_var = CastToFP16(var);
+        new_ins[pair.first].emplace_back(new_var);
+      }
+    }
+  } else if (AmpOperators::Instance().GetBlockOps()->count(op_type)) {
+    for (const auto& pair : ins) {
+      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
+              << first_dtype_str(pair.second) << " to float";
+      for (const auto& var : pair.second) {
+        auto new_var = CastToFP32(var);
+        new_ins[pair.first].emplace_back(new_var);
+      }
+    }
+  } else {
+    auto dst_type = GetPromoteType(ins);
+
+    for (const auto& pair : ins) {
+      VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from "
+              << first_dtype_str(pair.second) << " to "
+              << framework::DataTypeToString(dst_type);
+      for (const auto& var : pair.second) {
+        // NOTE(zhiqiu): Conv + BN always occur together, we needn't
+        // cast X of batch_norm to FP32, which is produced by conv as FP16 type.
+        if (op_type == "batch_norm" && pair.first == "X" &&
+            dst_type == framework::proto::VarType::FP32) {
+          new_ins[pair.first].emplace_back(var);
+          continue;
+        }
+        auto new_var = dst_type == framework::proto::VarType::FP32
+                           ? CastToFP32(var)
+                           : CastToFP16(var);
+        new_ins[pair.first].emplace_back(new_var);
+      }
+    }
+  }
+  // Every branch above fills new_ins; the former trailing `return ins;` could
+  // never be reached and has been removed.
+  return new_ins;
+}
+
+}  // namespace imperative
+}  // namespace paddle