diff --git a/CMakeLists.txt b/CMakeLists.txt index b1554fba5e1fa..fa87cc14f2668 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -131,6 +131,7 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) +option(WITH_MUSL "Compile with musl libc instead of glibc" OFF) # PY_VERSION if(NOT PY_VERSION) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index cf458d9770675..fc984f5e560ef 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -51,6 +51,16 @@ if(WIN32) endif(NOT MSVC) endif(WIN32) +if(WITH_MUSL) + add_definitions(-DPADDLE_WITH_MUSL) + + message(STATUS "Set compile option WITH_MKL=OFF when WITH_MUSL=ON") + SET(WITH_MKL OFF) + + message(STATUS "Set compile option WITH_GPU=OFF when WITH_MUSL=ON") + SET(WITH_GPU OFF) +endif() + if(WITH_PSLIB) add_definitions(-DPADDLE_WITH_PSLIB) endif() diff --git a/cmake/init.cmake b/cmake/init.cmake index 902dfb11fc0af..5f36a9adf1ae6 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -28,5 +28,6 @@ endif() if(WIN32) set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -Os -DNDEBUG") endif() diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index bb5e2e1369a84..d31943289d7a1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -123,7 +123,9 @@ cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce) cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc device_context) -cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost) +cc_library(op_version_proto SRCS op_version_proto.cc DEPS framework_proto boost) + +cc_library(op_version_registry SRCS op_version_registry.cc DEPS op_version_proto framework_proto boost) cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog) diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 29312370b3448..c33d71b3b0a9c 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -179,29 +179,15 @@ message BlockDesc { optional int32 forward_block_idx = 5 [ default = -1 ]; } -// CompatibleInfo is used to determine if a feature is compatible and -// provides the information. -message CompatibleInfo { - enum Type { - COMPATIBLE = 0; - DEFINITELY_NOT = 1; - POSSIBLE = 2; - BUG_FIX = 3; - PRECISION_CHANGE = 4; - } - required string version = 1; - required Type type = 2; -} - -// In some cases, Paddle Fluid may perform operator definition iterations, -// and the operator uses OpCompatibleMap for compatibility testing. -message OpCompatibleMap { - message OpCompatiblePair { +// In some cases, Paddle may perform operator definition iterations, +// and the operator uses OpVersionMap for compatibility testing.
+message OpVersion { required int32 version = 1; } +message OpVersionMap { + message OpVersionPair { required string op_name = 1; - required CompatibleInfo compatible_info = 2; + required OpVersion op_version = 2; } - repeated OpCompatiblePair pair = 1; - optional string default_required_version = 2; + repeated OpVersionPair pair = 1; } // Please refer to @@ -210,8 +196,8 @@ message OpCompatibleMap { // TODO(panyx0718): A model can have multiple programs. Need a // way to distinguish them. Maybe ID or name? message ProgramDesc { - reserved 2; // For backward compatibility. + reserved 2, 3; // For backward compatibility. repeated BlockDesc blocks = 1; optional Version version = 4; - optional OpCompatibleMap op_compatible_map = 3; + optional OpVersionMap op_version_map = 5; } diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc index 826e14dedb76d..93826fc97b196 100644 --- a/paddle/fluid/framework/op_compatible_info.cc +++ b/paddle/fluid/framework/op_compatible_info.cc @@ -182,40 +182,5 @@ OpCompatibleType OpCompatibleMap::IsRequireMiniVersion( } } -bool OpCompatibleMap::ConvertToProto(proto::OpCompatibleMap* desc) const { - desc->Clear(); - desc->set_default_required_version(default_required_version_); - for (auto pair : op_compatible_map_) { - const CompatibleInfo& info = pair.second; - auto* pair_desc = desc->add_pair(); - pair_desc->set_op_name(pair.first); - auto* info_desc = pair_desc->mutable_compatible_info(); - info_desc->set_version(info.required_version_); - info_desc->set_type( - static_cast(info.compatible_type_)); - } - return true; -} - -bool OpCompatibleMap::ReadFromProto(const proto::OpCompatibleMap& desc) { - std::string version = desc.default_required_version(); - if (version.empty()) { - LOG(INFO) << "The default operator required version is missing." - " Please update the model version."; - return false; - } - op_compatible_map_.clear(); - default_required_version_ = desc.default_required_version(); - for (int i = 0; i < desc.pair_size(); ++i) { - const auto& pair_desc = desc.pair(i); - auto info_desc = pair_desc.compatible_info(); - CompatibleInfo info(info_desc.version(), - static_cast(info_desc.type())); - std::pair pair(pair_desc.op_name(), info); - op_compatible_map_.insert(pair); - } - return true; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_compatible_info.h b/paddle/fluid/framework/op_compatible_info.h index 01fbdef99cbbc..6f86b8b64ed21 100644 --- a/paddle/fluid/framework/op_compatible_info.h +++ b/paddle/fluid/framework/op_compatible_info.h @@ -58,14 +58,6 @@ class OpCompatibleMap { OpCompatibleType IsRequireMiniVersion(std::string op_name, std::string current_version) const; - // Convert the entire OpCompatibleMap to Proto, which can be serialized - // to the model file as part of the ProgramDesc. - bool ConvertToProto(proto::OpCompatibleMap* desc) const; - - // Read and reset the entire object from proto, which can be read from - // the model file as part of the program. 
- bool ReadFromProto(const proto::OpCompatibleMap& desc); - const std::string& GetDefaultRequiredVersion() const { return default_required_version_; } diff --git a/paddle/fluid/framework/op_compatible_info_test.cc b/paddle/fluid/framework/op_compatible_info_test.cc index 98f3f5071ad28..cf210ed8ab2d5 100644 --- a/paddle/fluid/framework/op_compatible_info_test.cc +++ b/paddle/fluid/framework/op_compatible_info_test.cc @@ -28,12 +28,6 @@ TEST(test_op_compatible_info, test_op_compatible) { auto comp_map = OpCompatibleMap(); comp_map.InitOpCompatibleMap(); - // Ensure save-load consistency. - auto program_desc = ProgramDesc(); - proto::OpCompatibleMap* proto_map = program_desc.OpCompatibleMap(); - comp_map.ConvertToProto(proto_map); - comp_map.ReadFromProto(*proto_map); - ASSERT_NE(comp_map.GetDefaultRequiredVersion(), std::string()); ASSERT_NE(comp_map.GetOpCompatibleInfo("sequence_pad").required_version_, std::string()); diff --git a/paddle/fluid/framework/op_version_proto.cc b/paddle/fluid/framework/op_version_proto.cc new file mode 100644 index 0000000000000..696e322380740 --- /dev/null +++ b/paddle/fluid/framework/op_version_proto.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_version_proto.h" diff --git a/paddle/fluid/framework/op_version_proto.h b/paddle/fluid/framework/op_version_proto.h new file mode 100644 index 0000000000000..1a876f43d2f00 --- /dev/null +++ b/paddle/fluid/framework/op_version_proto.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include <string> +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace framework { +namespace compatible { +namespace pb { + +class OpVersion { + public: + explicit OpVersion(proto::OpVersion* desc) : desc_{desc} {} + void SetVersionID(uint32_t version) { desc_->set_version(version); } + + private: + proto::OpVersion* desc_; +}; + +class OpVersionMap { + public: + explicit OpVersionMap(proto::OpVersionMap* desc) : desc_{desc} {} + OpVersion operator[](const std::string& key) { + for (int i = 0; i < desc_->pair_size(); ++i) { + if (desc_->pair(i).op_name() == key) { + return OpVersion(desc_->mutable_pair(i)->mutable_op_version()); + } + } + auto* pair = desc_->add_pair(); + pair->set_op_name(key); + return OpVersion(pair->mutable_op_version()); + } + + private: + proto::OpVersionMap* desc_; +}; + +} // namespace pb +} // namespace compatible +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_version_registry.cc b/paddle/fluid/framework/op_version_registry.cc index 11b7224e68340..9a67c160f0233 100644 --- a/paddle/fluid/framework/op_version_registry.cc +++ b/paddle/fluid/framework/op_version_registry.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index fea043a0ff311..5ddaf1bd8d8ce 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -159,12 +160,14 @@ class OpVersionRegistrar { op_version_map_.insert({op_type, OpVersion()}); return op_version_map_[op_type]; } + const std::unordered_map<std::string, OpVersion>& GetVersionMap() { + return op_version_map_; + } uint32_t GetVersionID(const std::string& op_type) const { auto it = op_version_map_.find(op_type); if (it == op_version_map_.end()) { return 0; } - return it->second.GetVersionID(); } @@ -175,6 +178,14 @@ class OpVersionRegistrar { OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete; }; +inline void SaveOpVersions( + const std::unordered_map<std::string, OpVersion>& src, + pb::OpVersionMap* dst) { + for (const auto& pair : src) { + (*dst)[pair.first].SetVersionID(pair.second.GetVersionID()); + } +} + class OpVersionComparator { public: virtual bool operator()() = 0; diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index d6b18751cefe5..2b173c9571588 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index d37a16a3e7d9f..0faa870f50565 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -39,8 +39,8 @@ proto::ProgramDesc *ProgramDesc::Proto() { return &desc_; } -proto::OpCompatibleMap *ProgramDesc::OpCompatibleMap() { - return desc_.mutable_op_compatible_map(); +proto::OpVersionMap *ProgramDesc::OpVersionMap() { + return desc_.mutable_op_version_map(); } int64_t ProgramDesc::Version() const { return desc_.version().version(); } diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index 5cafc9111da67..8b1aac95fc288 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -58,7 +58,7 @@ class ProgramDesc { proto::ProgramDesc *Proto(); - proto::OpCompatibleMap *OpCompatibleMap(); + proto::OpVersionMap *OpVersionMap(); int64_t Version() const; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index f85e1f6511656..6d35d3395ba60 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -88,7 +88,7 @@ if(NOT APPLE AND NOT WIN32) set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") # check symbol hidden FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake - "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" + "execute_process(COMMAND sh -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n" "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" " message(FATAL_ERROR \"Check symbol failed.\")\n" diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6c68b385bcbc0..98bee2d4bb471 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -192,11 +192,6 @@ bool AnalysisPredictor::PrepareProgram( // If config_.ir_optim() is False, parameters is loaded in LoadParameters(), // still need to create other persistable variables. // So in both case, create persistable variables at first. - if (!CheckOperatorCompatible()) { - LOG(WARNING) << "WARNING: Results may be DIFF! 
" - "Please use the corresponding version of the model and " - "prediction library, and do not use the develop branch."; - } executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); // if enable_ir_optim_ is false, @@ -998,40 +993,6 @@ std::string AnalysisPredictor::GetSerializedProgram() const { return inference_program_->Proto()->SerializeAsString(); } -bool AnalysisPredictor::CheckOperatorCompatible() { - if (!inference_program_) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Inference program version check failed because the program does not " - "exist.")); - return false; - } - bool res = true; - op_compatible_map_.ReadFromProto(*inference_program_->OpCompatibleMap()); - const auto &version = framework::DumpVersion(framework::kCurProgramVersion); - LOG(INFO) << "MODEL VERSION: " - << framework::DumpVersion(inference_program_->Version()); - LOG(INFO) << "PREDICTOR VERSION: " << version; - std::set op_types; - for (size_t i = 0; i < inference_program_->Size(); ++i) { - const auto &block = inference_program_->Block(i); - for (const auto *op : block.AllOps()) { - op_types.insert(op->Type()); - } - } - for (const auto type : op_types) { - auto compatible_type = - op_compatible_map_.IsRequireMiniVersion(type, version); - if (compatible_type != framework::OpCompatibleType::compatible) { - if (!framework::kCurProgramVersion) { - LOG(WARNING) << " - Version incompatible (" - << static_cast(compatible_type) << ") " << type; - } - res = false; - } - } - return res; -} - // Add SaveOptimModel void AnalysisPredictor::SaveOptimModel(const std::string &dir) { // save model diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index c4a7173b0104b..269f2fd80bb47 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -335,13 +335,6 @@ class AnalysisPredictor : public PaddlePredictor { /// AnalysisPredictor::ZeroCopyRun() now. /// void MkldnnPostReset(); - /// - /// \brief Compute compatibility based on model version information and - /// operator version information - /// - /// \return Compatible information - /// - bool CheckOperatorCompatible(); #if PADDLE_WITH_TENSORRT /// diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh index b6b7d1f20baf7..a0f64796576c8 100755 --- a/paddle/fluid/inference/check_symbol.sh +++ b/paddle/fluid/inference/check_symbol.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh lib=$1 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index 50486ad041aa4..31f0c26a3f3a1 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -25,7 +26,6 @@ class CudnnLSTMOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTM"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTM"); OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTM"); OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTM"); @@ -122,7 +122,13 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("W", "(Tensor) the learnable hidden-hidden weights." " The shape is (N), where N is total weight size of the LSTM. " - " cudnn concatenate all the weight to one Tensor"); + " cudnn concatenate all the weight to one Tensor") + .AsDispensable(); + AddInput("WeightList", + "(vector), stores weight and bias data when the weight " + "use the list format. ") + .AsDispensable() + .AsDuplicable(); AddInput("SequenceLength", "(Tensor) When the input data is padding, " "set this parameter. This parameter represents " @@ -216,7 +222,6 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTMGrad"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTMGrad"); OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTMGrad"); OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTMGrad"); @@ -228,7 +233,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { }; SetOutGradDim("Input"); - SetOutGradDim("W"); + if (ctx->HasInputs("WeightList")) { + ctx->SetOutputsDim(framework::GradVarName("WeightList"), + ctx->GetInputsDim("WeightList")); + } SetOutGradDim("InitH"); SetOutGradDim("InitC"); } @@ -251,7 +259,9 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("Input", this->Input("Input")); op->SetInput("InitH", this->Input("InitH")); op->SetInput("InitC", this->Input("InitC")); - op->SetInput("W", this->Input("W")); + if (this->HasInput("WeightList")) { + op->SetInput("WeightList", this->Input("WeightList")); + } if (this->HasInput("SequenceLength")) { op->SetInput("SequenceLength", this->Input("SequenceLength")); } @@ -262,8 +272,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker { op->SetInput(framework::GradVarName("LastC"), this->OutputGrad("LastC")); op->SetInput(framework::GradVarName("LastH"), this->OutputGrad("LastH")); + if (this->HasInput("WeightList")) { + op->SetOutput(framework::GradVarName("WeightList"), + this->InputGrad("WeightList", false)); + } + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - op->SetOutput(framework::GradVarName("W"), this->InputGrad("W")); op->SetOutput(framework::GradVarName("InitH"), this->InputGrad("InitH")); op->SetOutput(framework::GradVarName("InitC"), this->InputGrad("InitC")); op->SetAttrMap(this->Attrs()); @@ -290,3 +304,20 @@ REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel); REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel); + +// TODO(Shixiaowei02) Add ModifyInput support +REGISTER_OP_VERSION(cudnn_lstm) + .AddCheckpoint( + R"ROC( + Upgrade cudnn_lstm add a new input [WeightList] and modify input [W] to dispensable.)ROC", + 
paddle::framework::compatible::OpVersionDesc() + .NewInput( + "WeightList", + "The WeightList stores weight and bias data. WeightList is " + "dispensable.") + .NewInput("SequenceLength", + "When the input data is padding, set this parameter. " + "SequenceLength is dispensable.") + .NewOutput("StateOut", "Store the global drop state when training") + .NewOutput("Reserve", + "A temporary output Tensor to store the reserve_data")); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 6ac75b78d7058..bea7d9c02ca7d 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -30,6 +30,66 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; +template +bool is_continuous(const Type &weight_list) { + bool continuous = true; + for (size_t i = 0; i < weight_list.size() - 1; ++i) { + auto *in_data = weight_list[i]->template data(); + auto *in_after_data = weight_list[i + 1]->template data(); + auto in_size = weight_list[i]->numel(); + bool temp = in_data + in_size == in_after_data; + continuous = continuous && temp; + } + return continuous; +} + +int size_sum(const std::vector &weight_list) { + int size = 0; + for (size_t i = 0; i < weight_list.size(); ++i) { + auto in_size = weight_list[i]->numel(); + size += in_size; + } + return size; +} + +template +void weight_to_tensor(const platform::Place &place, cudaStream_t stream, + const std::vector &weight_list, + Tensor *weight) { + auto weight_data = weight->data(); + int weight_offset = 0; + for (size_t i = 0; i < weight_list.size(); ++i) { + const T *in_data = weight_list[i]->data(); + auto in_size = weight_list[i]->numel(); + + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight->place()), + weight_data + weight_offset, + BOOST_GET_CONST(platform::CUDAPlace, weight_list[i]->place()), + in_data, in_size * sizeof(T), stream); + weight_offset += in_size; + } +} + +template +void weight_to_tensor_list(const platform::Place &place, cudaStream_t stream, + std::vector *weight_grad, + const std::vector &weight_input, + const Tensor *weight) { + int weight_offset = 0; + auto *weight_data = weight->data(); + for (size_t i = 0; i < weight_input.size(); ++i) { + auto in_size = weight_input[i]->numel(); + T *weight_grad_data = (*weight_grad)[i]->mutable_data(place); + const T *src = weight_data + weight_offset; + + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, (*weight_grad)[i]->place()), + weight_grad_data, BOOST_GET_CONST(platform::CUDAPlace, weight->place()), + src, in_size * sizeof(T), stream); + weight_offset += in_size; + } +} + template void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, const int &seq_length, ScopedRNNBase *rnn, const T *x_data, @@ -75,8 +135,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { const Tensor *init_h = ctx.Input("InitH"); const Tensor *init_c = ctx.Input("InitC"); - auto w = ctx.Input("W"); - Tensor *out = ctx.Output("Out"); Tensor *last_h = ctx.Output("LastH"); Tensor *last_c = ctx.Output("LastC"); @@ -87,8 +145,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { const T *init_h_data = init_h->data(); const T *init_c_data = init_c->data(); - const T *w_data = w->data(); - T *out_data = out->mutable_data(ctx.GetPlace()); T *last_h_data = last_h->mutable_data(ctx.GetPlace()); T *last_c_data = last_c->mutable_data(ctx.GetPlace()); @@ -113,11 +169,45 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int seq_length = 
x->dims()[0]; int batch_size = x->dims()[1]; int input_size = x->dims()[2]; - int weight_numel = w->numel(); bool state_initialized = state_out->IsInitialized() ? true : false; size_t workspace_size; size_t reserve_size; + Tensor weight_whole; + T *w_data = nullptr; + int weight_numel; + bool w_initialized = false; + auto place = ctx.GetPlace(); + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + if (is_test && ctx.HasInput("W")) { + auto *W = ctx.Input("W"); + w_initialized = W->IsInitialized() ? true : false; + weight_numel = W->numel(); + } + if (!w_initialized) { + auto weight_list = ctx.MultiInput("WeightList"); + bool continuous = + is_continuous>(weight_list); + weight_numel = size_sum(weight_list); + + if (!continuous) { + LOG_FIRST_N(WARNING, 2) + << "If the memory space of the Input WeightList is not " + "continuous, less efficient calculation will be " + "called. Please call coalesce_tensor op to make the " + "input memory continuous."; + weight_whole.mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, &weight_whole); + w_data = weight_whole.data(); + } else { + w_data = const_cast(weight_list[0]->data()); + } + } else { + auto *W = ctx.Input("W"); + w_data = const_cast(W->data()); + } ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, num_layers, dropout_prob, seed, weight_numel, @@ -136,6 +226,12 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { LSTMInferece(has_seq_length, handle, seq_length, &rnn, x_data, init_h_data, init_c_data, w_data, out_data, last_h_data, last_c_data, &workspace_data_, workspace_size); + if (!w_initialized && ctx.HasInput("W") && ctx.HasInput("WeightList")) { + auto *W = const_cast(ctx.Input("W")); + auto weight_list = ctx.MultiInput("WeightList"); + W->mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, W); + } } else { if (!has_seq_length) { // for train @@ -176,11 +272,11 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto *input = ctx.Input("Input"); - auto *weight = ctx.Input("W"); auto *init_h = ctx.Input("InitH"); auto *init_c = ctx.Input("InitC"); auto *reserve = ctx.Input("Reserve"); auto *state_out = ctx.Input("StateOut"); + auto weight_list = ctx.MultiInput("WeightList"); auto *out = ctx.Input("Out"); auto *out_grad = ctx.Input(framework::GradVarName("Out")); @@ -188,9 +284,10 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto *last_c_grad = ctx.Input(framework::GradVarName("LastC")); auto *in_grad = ctx.Output(framework::GradVarName("Input")); - auto *weight_grad = ctx.Output(framework::GradVarName("W")); auto *init_h_grad = ctx.Output(framework::GradVarName("InitH")); auto *init_c_grad = ctx.Output(framework::GradVarName("InitC")); + auto weight_grad_list = ctx.MultiOutput( + framework::GradVarName("WeightList")); auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); @@ -199,7 +296,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto init_h_dims = init_h->dims(); auto init_c_dims = init_c->dims(); - auto *weight_data = weight->data(); auto *init_h_data = init_h->data(); auto *init_c_data = init_c->data(); auto *out_data = out->data(); @@ -207,18 +303,50 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto *last_h_grad_data = last_h_grad->data(); auto *last_c_grad_data = last_c_grad->data(); + auto place = ctx.GetPlace(); + int weight_numel = 
size_sum(weight_list); + bool continuous = + is_continuous>(weight_list); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + Tensor weight_whole; + T *weight_data = nullptr; + + if (!continuous) { + weight_whole.mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, &weight_whole); + weight_data = weight_whole.data(); + } else { + weight_data = const_cast(weight_list[0]->data()); + } + + Tensor weight_grad; math::SetConstant zero; - weight_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, weight_grad, static_cast(0.0)); + weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); + zero(dev_ctx, &weight_grad, static_cast(0.0)); + T *weight_grad_data = weight_grad.data(); + + int offset = 0; + for (size_t i = 0; i < weight_grad_list.size(); ++i) { + size_t len = weight_grad_list[i]->numel(); + auto dim = weight_grad_list[i]->dims(); + weight_grad_list[i] + ->ShareDataWith(weight_grad.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } in_grad->mutable_data(input_dims, ctx.GetPlace()); auto *in_grad_data = in_grad->data(); - init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); - auto *init_h_grad_data = init_h_grad->data(); + if (init_h_grad) init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); + auto *init_h_grad_data = init_h_grad ? init_h_grad->data() : nullptr; - init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); - auto *init_c_grad_data = init_c_grad->data(); + if (init_c_grad) init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); + auto *init_c_grad_data = init_c_grad ? init_c_grad->data() : nullptr; float dropout_prob = ctx.Attr("dropout_prob"); bool is_bidirec = ctx.Attr("is_bidirec"); @@ -236,7 +364,6 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { int seq_length = input_dims[0]; int batch_size = input->dims()[1]; int input_size = input->dims()[2]; - int weight_numel = weight->numel(); size_t workspace_size; size_t reserve_size; @@ -268,8 +395,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), workspace_data_.data(), workspace_size, rnn.weight_desc(), - weight_grad->data(), const_cast(reserve_data), - reserve_size)); + weight_grad_data, const_cast(reserve_data), reserve_size)); } else { #if CUDNN_VERSION >= 7201 // for train @@ -288,7 +414,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), rnn.init_h_desc(), init_h->data(), rnn.y_seq_desc(), out->data(), workspace_data_.data(), workspace_size, - rnn.weight_desc(), weight_grad->data(), + rnn.weight_desc(), weight_grad_data, const_cast(reserve_data), reserve_size)); #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index e9b4c7dacf8b4..04fa8db9a5a6f 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -404,6 +404,10 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker "the received is %d", bit_length)); }); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddComment(R"DOC( The scale of FakeChannelWiseQuantize operator is a vector. In detail, each channel of the input X has a scale value. 
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 2f5afbe0eedf9..94a75f930beba 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -146,16 +146,19 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { auto* out = context.Output<framework::Tensor>("Out"); auto* out_scale = context.Output<framework::Tensor>("OutScale"); - T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace()); int bit_length = context.Attr<int>("bit_length"); int bin_cnt = std::pow(2, bit_length - 1) - 1; int quant_axis = context.Attr<int>("quant_axis"); + bool is_test = context.Attr<bool>("is_test"); auto& dev_ctx = context.template device_context<DeviceContext>(); - FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis, - out_scale_data); + if (!is_test) { + T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace()); + FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis, + out_scale_data); + } ChannelClipAndFakeQuantFunctor<DeviceContext, T>()( dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out); } diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index 03279a9b2c15b..1018adcd930a4 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -181,10 +181,22 @@ class InstanceNormKernel auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>(); auto *place = dev_ctx.eigen_device(); + Eigen::DSizes<int, 2> shape(NxC, sample_size); +// Once eigen on Windows is updated, the if branch can be removed. +#ifndef EIGEN_HAS_INDEX_LIST Eigen::DSizes<int, 2> bcast(1, sample_size); Eigen::DSizes<int, 2> C_shape(C, 1); Eigen::DSizes<int, 2> NxC_shape(NxC, 1); - Eigen::DSizes<int, 2> shape(NxC, sample_size); + Eigen::DSizes<int, 1> rdims(1); +#else + Eigen::IndexList<Eigen::type2index<1>, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList<int, Eigen::type2index<1>> C_shape; + C_shape.set(0, C); + Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape; + NxC_shape.set(0, NxC); + Eigen::IndexList<Eigen::type2index<1>> rdims; +#endif math::SetConstant<platform::CPUDeviceContext, T> set_constant; @@ -201,8 +213,6 @@ class InstanceNormKernel auto x_e = framework::EigenVector<T>::Flatten(*x); auto x_arr = x_e.reshape(shape); - Eigen::DSizes<int, 1> rdims(1); - saved_mean_e.device(*place) = x_arr.mean(rdims); auto saved_variance_arr = (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; @@ -316,14 +326,25 @@ class InstanceNormGradKernel auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>(); auto *place = dev_ctx.eigen_device(); + Eigen::DSizes<int, 2> rshape(NxC, sample_size); + Eigen::DSizes<int, 2> param_shape(N, C); + Eigen::DSizes<int, 2> shape(NxC, sample_size); +#ifndef EIGEN_HAS_INDEX_LIST Eigen::DSizes<int, 1> rdims(0); Eigen::DSizes<int, 1> mean_rdims(1); - Eigen::DSizes<int, 2> rshape(NxC, sample_size); Eigen::DSizes<int, 2> bcast(1, sample_size); Eigen::DSizes<int, 2> C_shape(C, 1); Eigen::DSizes<int, 2> NxC_shape(NxC, 1); - Eigen::DSizes<int, 2> param_shape(N, C); - Eigen::DSizes<int, 2> shape(NxC, sample_size); +#else + Eigen::IndexList<Eigen::type2index<0>> rdims; + Eigen::IndexList<Eigen::type2index<1>> mean_rdims; + Eigen::IndexList<Eigen::type2index<1>, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList<int, Eigen::type2index<1>> C_shape; + C_shape.set(0, C); + Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape; + NxC_shape.set(0, NxC); +#endif math::SetConstant<platform::CPUDeviceContext, T> set_constant; diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 40cea7483f397..fec738378a64c 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -60,19 +60,25 @@ class Pool2dFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph *
stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); @@ -81,9 +87,10 @@ class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } + pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -137,19 +144,25 @@ class Pool2dFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); @@ -158,9 +171,9 @@ class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -178,19 +191,25 @@ class Pool2dFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { @@ -201,10 +220,9 @@ class Pool2dFunctor { &ele); } } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; - + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width * output_channels + pw * output_channels + c] = ele; @@ -262,23 +280,29 @@ class Pool2dGradFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -346,23 +370,29 @@ class Pool2dGradFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); } - int pool_size = (exclusive || adaptive) - ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -391,23 +421,29 @@ class Pool2dGradFunctor { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + hstart = ph * stride_height - padding_height; wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + hend = std::min(hstart + ksize_height, + input_height + padding_height); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (hend - hstart) * (wend - wstart); + wstart = std::max(wstart, 0); + hstart = std::max(hstart, 0); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); + } + if (exclusive || adaptive) { + pool_size = (hend - hstart) * (wend - wstart); } - int pool_size = (exclusive || adaptive) - ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -672,34 +708,43 @@ class Pool3dFunctor { int dstart, dend; int hstart, hend; int wstart, wend; + for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = + std::min(dstart + ksize_depth, input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); @@ -712,10 +757,9 @@ class Pool3dFunctor { } } } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[output_idx] = ele; } @@ -767,7 +811,6 @@ class Pool3dFunctor { int dstart, dend; int hstart, hend; int wstart, wend; - if (!channel_last) { const int input_stride = input_depth * input_height * input_width; const int output_stride = output_depth * output_height * output_width; @@ -777,29 +820,40 @@ class Pool3dFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } + int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); for (int d = dstart; d < dend; ++d) { @@ -811,10 +865,10 @@ class Pool3dFunctor { } } } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); output_data[output_idx] = ele; } @@ -835,28 +889,38 @@ class Pool3dFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } T ele = pool_process.initial(); @@ -871,10 +935,10 @@ class Pool3dFunctor { } } } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } pool_process.finalize(static_cast(pool_size), &ele); int output_idx = ((pd * output_height + ph) * output_width + pw) * @@ -943,34 +1007,42 @@ class Pool3dGradFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = + std::min(dstart + ksize_depth, input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = + std::min(wstart + ksize_width, input_width + padding_width); + + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + } float scale = 1.0 / pool_size; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -1046,34 +1118,44 @@ class Pool3dGradFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } float scale = 1.0 / pool_size; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -1108,34 +1190,44 @@ class Pool3dGradFunctor { if (adaptive) { dstart = AdaptStartIndex(pd, input_depth, output_depth); dend = AdaptEndIndex(pd, input_depth, output_depth); - } else { - dstart = pd * stride_depth - padding_depth; - dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); } + for (int ph = 0; ph < output_height; ++ph) { if (adaptive) { hstart = AdaptStartIndex(ph, input_height, output_height); hend = AdaptEndIndex(ph, input_height, output_height); - } else { - hstart = ph * stride_height - padding_height; - hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); } + for (int pw = 0; pw < output_width; ++pw) { + int pool_size = 1; if (adaptive) { wstart = AdaptStartIndex(pw, input_width, output_width); wend = AdaptEndIndex(pw, input_width, output_width); } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, + input_depth + padding_depth); + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, + input_height + padding_height); wstart = pw * stride_width - padding_width; - wend = std::min(wstart + ksize_width, input_width); + wend = std::min(wstart + ksize_width, + input_width + padding_width); + + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + dend = std::min(dend, input_depth); + hend = std::min(hend, input_height); + wend = std::min(wend, input_width); } - int pool_size = - (exclusive || adaptive) - ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + if (exclusive || adaptive) { + pool_size = + (dend - dstart) * (hend - hstart) * (wend - wstart); + } float scale = 1.0 / pool_size; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 809164df2056c..129298edafcf9 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -348,6 +348,181 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx, return dim; } +template +class MatMulDoubleGradKernel : public framework::OpKernel { + public: + void MatMul(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + const framework::Tensor &b, bool trans_b, bool flag, + framework::Tensor *out) const { + out->mutable_data(context.GetPlace()); + auto blas = math::GetBlas(context); + auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); + auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); + + int head_number = 1; +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) + head_number = context.Attr("head_number"); +#endif + + if (head_number <= 1 && a.dims().size() == 3 && b.dims().size() <= 2) { + // the transpose_X must be false, if is true, the transpose cost much time + if (!trans_a) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; + } + } + blas.MatMul(a, mat_dim_a, b, mat_dim_b, + static_cast(context.Attr("alpha")), out, + static_cast(flag)); + } + + void CalcInputGrad(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + bool is_fold_init_dims_a, const framework::Tensor &b, + bool trans_b, bool is_fold_init_dims_b, bool flag, + framework::Tensor *out) const { + if (out == nullptr) return; + bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && + out->dims().size() == 2; + if (!need_combine) { + MatMul(context, a, trans_a, b, trans_b, flag, out); + } else { + auto &ctx = context.template device_context(); + MatMul(context, is_fold_init_dims_a + ? FoldInitDims(a) + : FoldHeadAndLastDims(ctx, a), + trans_a, is_fold_init_dims_b + ? 
FoldInitDims(b) + : FoldHeadAndLastDims(ctx, b), + trans_b, flag, out); + } + } + + void Compute(const framework::ExecutionContext &context) const override { + auto x = *context.Input("X"); + auto y = *context.Input("Y"); + auto dout = *context.Input("DOut"); + auto *ddx = context.Input("DDX"); + auto *ddy = context.Input("DDY"); + + auto *dx = context.Output("DX"); + auto *dy = context.Output("DY"); + auto *ddout = context.Output("DDOut"); + + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + framework::DDim ddout_dims; + if (ddout) { + ddout_dims = ddout->dims(); + if (ddout_dims != dout.dims()) { + ddout->Resize(dout.dims()); + } + } + + bool ddout_flag = false; + if (ddx) { + auto ddx_mat = *ddx; + if (ddx_mat.dims() != x.dims()) { + ddx_mat.Resize(x.dims()); + } + if (dy) { + if (transpose_x && transpose_y) { + // dy = dout' * ddx' + CalcInputGrad(context, dout, true, true, ddx_mat, true, false, false, + dy); + } else if (transpose_x) { + // dy = ddx * dout + CalcInputGrad(context, ddx_mat, false, false, dout, false, true, + false, dy); + } else if (transpose_y) { + // dy = dout' * ddx + CalcInputGrad(context, dout, true, true, ddx_mat, false, true, false, + dy); + } else { + // dy = ddx' * dout + CalcInputGrad(context, ddx_mat, true, true, dout, false, true, false, + dy); + } + } + + if (ddout) { + CalcInputGrad(context, ddx_mat, transpose_x, true, y, transpose_y, + false, ddout_flag, ddout); + ddout_flag = true; + } + } + + if (ddy) { + auto ddy_mat = *ddy; + if (ddy_mat.dims() != y.dims()) { + ddy_mat.Resize(y.dims()); + } + if (dx) { + if (transpose_x && transpose_y) { + // dx = ddy' * dout' + CalcInputGrad(context, ddy_mat, true, true, dout, true, false, false, + dx); + } else if (transpose_x) { + // dx = ddy * dout' + CalcInputGrad(context, ddy_mat, false, false, dout, true, false, + false, dx); + } else if (transpose_y) { + // dx = dout * ddy + CalcInputGrad(context, dout, false, false, ddy_mat, false, true, + false, dx); + } else { + // dx = dout * ddy' + CalcInputGrad(context, dout, false, false, ddy_mat, true, false, + false, dx); + } + } + + if (ddout) { + CalcInputGrad(context, x, transpose_x, true, ddy_mat, transpose_y, + false, ddout_flag, ddout); + } + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + + if (ddout) { + if (ddout_dims != dout.dims()) { + ddout->Resize(ddout_dims); + } + } + } +}; + class MatMulOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -647,6 +822,61 @@ class MatMulOpGradMaker : public framework::SingleGradOpMaker { retv->SetAttrMap(this->Attrs()); } }; + +class MatMulOpDoubleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "matmul"); + OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "matmul"); + OP_INOUT_CHECK(context->HasInput("DOut"), "Input", "DOut", "matmul"); + + if (context->HasOutput("DX") && 
context->HasInput("DDY")) { + context->ShareDim("X", "DX"); + } + + if (context->HasOutput("DY") && context->HasInput("DDX")) { + context->ShareDim("Y", "DY"); + } + + if (context->HasOutput("DDOut") && + (context->HasInput("DDY") || context->HasInput("DDX"))) { + context->ShareDim("DOut", "DDOut"); + } + } +}; + +template +class MatMulOpDoubleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("matmul_grad_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput("Y", this->Input("Y")); + retv->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + retv->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + retv->SetInput("DDY", this->OutputGrad(framework::GradVarName("Y"))); + + auto ddx = this->OutputGrad(framework::GradVarName("X")); + auto ddy = this->OutputGrad(framework::GradVarName("Y")); + + if (!ddx.empty() || !ddy.empty()) { + retv->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } + retv->SetOutput( + "DX", ddy.empty() ? this->EmptyInputGrad() : this->InputGrad("X")); + retv->SetOutput( + "DY", ddx.empty() ? this->EmptyInputGrad() : this->InputGrad("Y")); + + retv->SetAttrMap(this->Attrs()); + } +}; + } // namespace operators } // namespace paddle @@ -654,7 +884,10 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker, ops::MatMulOpGradMaker, ops::MatMulOpGradMaker); -REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad); +REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad, + ops::MatMulOpDoubleGradMaker, + ops::MatMulOpDoubleGradMaker); +REGISTER_OPERATOR(matmul_grad_grad, ops::MatMulOpDoubleGrad); REGISTER_OP_CPU_KERNEL( matmul, ops::MatMulKernel, ops::MatMulKernel); @@ -663,6 +896,11 @@ REGISTER_OP_CPU_KERNEL( ops::MatMulGradKernel, ops::MatMulGradKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); + #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, @@ -675,4 +913,8 @@ REGISTER_OP_CUDA_KERNEL( ops::MatMulGradKernel, ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); #endif diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index 7db2e9421b5ca..6d8d18a3d126e 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -83,6 +83,18 @@ class MaxOutOp : public framework::OperatorWithKernel { "Attr(groups) of Op(maxout) should be " "larger than 1. 
But received %d.", groups)); + PADDLE_ENFORCE_EQ( + axis == 1 || axis == -1 || axis == 3, true, + platform::errors::InvalidArgument( + "axis only supported 1, -1 or 3, but recevied axis is: %d", axis)); + PADDLE_ENFORCE_EQ(in_x_dims.size(), 4, + platform::errors::InvalidArgument( + "x's dims should be 4, but received x's dims is: %d", + in_x_dims.size())); + + if (axis < 0) { + axis += in_x_dims.size(); + } PADDLE_ENFORCE_EQ( in_x_dims[axis] % groups, 0, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h index ec3897e4044ad..64b538fc5d5bd 100644 --- a/paddle/fluid/operators/maxout_op.h +++ b/paddle/fluid/operators/maxout_op.h @@ -31,6 +31,9 @@ class MaxOutKernel : public framework::OpKernel { Tensor* out = context.Output("Out"); int groups = context.template Attr("groups"); int axis = context.template Attr("axis"); + if (axis < 0) { + axis += in_x->dims().size(); + } math::MaxOutFunctor maxout_forward; maxout_forward(context.template device_context(), *in_x, out, @@ -49,6 +52,10 @@ class MaxOutGradKernel : public framework::OpKernel { Tensor* in_x_grad = context.Output(framework::GradVarName("X")); int groups = context.template Attr("groups"); int axis = context.template Attr("axis"); + if (axis < 0) { + axis += in_x->dims().size(); + } + auto& device_ctx = context.template device_context(); math::SetConstant zero; if (in_x_grad) { diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc new file mode 100644 index 0000000000000..c0aa00e79341e --- /dev/null +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +template +class AccuracyXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* inference = ctx.Input("Out"); + auto* indices = ctx.Input("Indices"); + auto* label = ctx.Input("Label"); + auto* accuracy = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + int* correct_data = correct->mutable_data(ctx.GetPlace()); + int* total_data = total->mutable_data(ctx.GetPlace()); + float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); + const int64_t* indices_data = indices->data(); + const int64_t* label_data = label->data(); + size_t num_samples = inference->dims()[0]; + size_t class_dim = inference->dims()[1]; + if (num_samples == 0) { + return; + } + size_t indices_int32_size = num_samples * class_dim * sizeof(int); + size_t indices_int64_size = num_samples * class_dim * sizeof(int64_t); + size_t label_int32_size = num_samples * sizeof(int); + size_t label_int64_size = num_samples * sizeof(int64_t); + auto& dev_ctx = ctx.template device_context(); + int* indices_int32_device = NULL; + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&indices_int32_device), + indices_int32_size), + XPU_SUCCESS, + platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU, Cannot allocate %s memory" + " on XPU. \n\nPlease check whether there is any other process " + "using XPU.\n", + string::HumanReadableSize(indices_int32_size))); + int* label_int32_device = NULL; + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&label_int32_device), + label_int32_size), + XPU_SUCCESS, + platform::errors::ResourceExhausted( + "\n\nOut of memory error on XPU, Cannot allocate %s memory" + " on XPU. 
\n\nPlease check whether there is any other process " + "using XPU.\n", + string::HumanReadableSize(label_int32_size))); + + int* indices_int32_host = + reinterpret_cast(std::malloc(indices_int32_size)); + int64_t* indices_int64_host = + reinterpret_cast(std::malloc(indices_int64_size)); + int* label_int32_host = + reinterpret_cast(std::malloc(label_int32_size)); + int64_t* label_int64_host = + reinterpret_cast(std::malloc(label_int64_size)); + dev_ctx.Wait(); + memory::Copy(platform::CPUPlace(), indices_int64_host, + BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + indices_data, indices_int64_size); + memory::Copy(platform::CPUPlace(), label_int64_host, + BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + label_data, label_int64_size); + for (int i = 0; i < num_samples; ++i) { + label_int32_host[i] = label_int64_host[i]; + for (int j = 0; j < class_dim; ++j) { + indices_int32_host[i * class_dim + j] = + indices_int64_host[i * class_dim + j]; + } + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + indices_int32_device, platform::CPUPlace(), indices_int32_host, + indices_int32_size); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + label_int32_device, platform::CPUPlace(), label_int32_host, + label_int32_size); + int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device, + label_int32_device, num_samples, class_dim, + correct_data, total_data, accuracy_data); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + dev_ctx.Wait(); + xpu_free(indices_int32_device); + xpu_free(label_int32_device); + std::free(indices_int32_host); + std::free(indices_int64_host); + std::free(label_int32_host); + std::free(label_int64_host); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + accuracy, + ops::AccuracyXPUKernel); + +#endif diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index bf12c61a4d9b1..72d2f779f800b 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -126,6 +126,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, data_dims, strides, ksize); + platform::PoolingMKLDNNHandler::ComputeAdaptivePoolParameters( + ctx, paddle::framework::vectorize(in_x->dims()), ksize, strides); + auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index e03824ca8c3f4..05bb37ee421ff 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -49,7 +49,8 @@ inline std::vector get_new_shape( "the element's shape must be [1]. 
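The accuracy XPU kernel above copies the int64 indices and labels to the host, narrows them to int32, and then calls xpu::accuracy. From Python nothing changes except the place; a hedged sketch (fluid.XPUPlace is assumed to be available only in a WITH_XPU build with a Kunlun card attached, so this will not run elsewhere):

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name="x", shape=[-1, 10], dtype="float32")
label = fluid.data(name="label", shape=[-1, 1], dtype="int64")
pred = fluid.layers.softmax(x)
acc = fluid.layers.accuracy(input=pred, label=label, k=1)   # lowers to the accuracy op

place = fluid.XPUPlace(0)   # assumption: WITH_XPU build with a Kunlun device
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
out, = exe.run(feed={"x": np.random.rand(4, 10).astype("float32"),
                     "label": np.random.randint(0, 10, (4, 1)).astype("int64")},
               fetch_list=[acc])
print(out)   # top-1 accuracy over the batch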
But received the element's shape " "is [%s]", tensor->dims())); - if (platform::is_gpu_place(tensor->place())) { + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place())) { framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); @@ -362,7 +363,8 @@ class ReshapeKernel { if (shape_tensor) { auto *shape_data = shape_tensor->data(); framework::Tensor cpu_shape_tensor; - if (platform::is_gpu_place(shape_tensor->place())) { + if (platform::is_gpu_place(shape_tensor->place()) || + platform::is_xpu_place(shape_tensor->place())) { TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); shape_data = cpu_shape_tensor.data(); @@ -375,9 +377,22 @@ class ReshapeKernel { out->Resize(out_dims); out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopy( - *in, ctx.GetPlace(), - ctx.template device_context(), out); + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(ctx.GetPlace())) { + auto &dev_ctx = + ctx.template device_context(); + xpu::memcpy_device( + dev_ctx.x_context(), out->data(), in->data(), + in->numel() * paddle::framework::SizeOfType(in->type())); + } else { +#endif + framework::TensorCopy( + *in, ctx.GetPlace(), + ctx.template device_context(), out); +#ifdef PADDLE_WITH_XPU + } +#endif out->Resize(out_dims); } }; @@ -644,3 +659,15 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, plat::float16, ops::ReshapeDoubleGradKernel); #endif + +#ifdef PADDLE_WITH_XPU +REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, + ops::ReshapeKernel, int, ops::ReshapeKernel, + int64_t, ops::ReshapeKernel, plat::float16, + ops::ReshapeKernel); +REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, + double, ops::ReshapeGradKernel, int, + ops::ReshapeGradKernel, int64_t, + ops::ReshapeGradKernel, plat::float16, + ops::ReshapeGradKernel); +#endif diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc new file mode 100644 index 0000000000000..4002be8100152 --- /dev/null +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
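reshape2 and reshape2_grad gain XPU registrations above: shape tensors that live on the device are copied back to the host before the output dims are inferred, and the data itself is moved with xpu::memcpy_device. A usage sketch under the same WITH_XPU assumption as before:

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name="x", shape=[-1, 2, 6], dtype="float32")
y = fluid.layers.reshape(x, shape=[-1, 3, 4])   # lowered to reshape2

exe = fluid.Executor(fluid.XPUPlace(0))   # assumption: WITH_XPU build
exe.run(fluid.default_startup_program())
out, = exe.run(feed={"x": np.ones((2, 2, 6), dtype="float32")}, fetch_list=[y])
print(out.shape)   # (2, 3, 4)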
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/scale_op.h" +#include +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { +template +class ScaleXPUKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto* in_var = ctx.InputVar("X"); + auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); + auto scale = static_cast(ctx.Attr("scale")); + auto bias = static_cast(ctx.Attr("bias")); + auto bias_after_scale = ctx.Attr("bias_after_scale"); + auto* out_var = ctx.OutputVar("Out"); + if (in_var->IsType() && in_var != out_var) { + auto& in_slr = in_var->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->set_rows(in_slr.rows()); + out_slr->set_height(in_slr.height()); + } + auto* out = + framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); + out->mutable_data(in->place()); + PADDLE_ENFORCE_EQ( + in->dims(), out->dims(), + platform::errors::InvalidArgument("In and out should have the same dim," + " expected %s, but got %s.", + in->dims().to_str().c_str(), + out->dims().to_str().c_str())); + auto& dev_ctx = ctx.template device_context(); + int r = xpu::scale(dev_ctx.x_context(), in->numel(), scale, bias, + bias_after_scale, in->data(), out->data()); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + scale, ops::ScaleXPUKernel); + +#endif diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc new file mode 100644 index 0000000000000..2e9092a643253 --- /dev/null +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/shape_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, + ops::ShapeKernel, ops::ShapeKernel, + ops::ShapeKernel); + +#endif diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc new file mode 100644 index 0000000000000..44fd555544e7f --- /dev/null +++ b/paddle/fluid/operators/sign_op_xpu.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
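The scale XPU kernel above checks that the input and output dims match and then evaluates out = x * scale + bias through xpu::scale, and shape gets an XPU registration for several dtypes. A short sketch, again assuming a WITH_XPU build:

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name="x", shape=[-1, 3], dtype="float32")
y = fluid.layers.scale(x, scale=2.0, bias=1.0, bias_after_scale=True)
s = fluid.layers.shape(x)

exe = fluid.Executor(fluid.XPUPlace(0))   # assumption: WITH_XPU build
exe.run(fluid.default_startup_program())
y_v, s_v = exe.run(feed={"x": np.ones((2, 3), dtype="float32")},
                   fetch_list=[y, s])
print(y_v)   # every element is 1.0 * 2.0 + 1.0 = 3.0
print(s_v)   # [2 3]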
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/sign_op.h" +#include "paddle/fluid/platform/xpu_header.h" +namespace paddle { +namespace operators { + +template +class SignXPUKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* out = context.Output("Out"); + auto* in = context.Input("X"); + out->mutable_data(in->place()); + auto xpu_context = context.device_context().x_context(); + int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN, + in->numel(), in->data(), out->data()); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + sign, ops::SignXPUKernel); + +#endif diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc new file mode 100644 index 0000000000000..29740000aeb4c --- /dev/null +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class SoftmaxXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + const int rank = x->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + PADDLE_ENFORCE_EQ(axis == -1 || axis == rank - 1, true, + platform::errors::InvalidArgument( + "xpu softmax kernel only support last dimension of x " + "(axis==-1 or axis==x_dims-1), but received axis: " + "%d, x's shape: %s.", + axis, x->dims())); + + // allocate memory on device. + out->mutable_data(context.GetPlace()); + + const int n = SizeToAxis(axis, x->dims()); + const int d = SizeFromAxis(axis, x->dims()); + + auto& dev_ctx = context.template device_context(); + int r = xpu::softmax2d_forward(dev_ctx.x_context(), x->data(), + out->data(), n, d, d <= 2048); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(softmax2d_forward) return wrong " + "value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +template +class SoftmaxGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Input("Out"); + auto* dout = context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + const int rank = dx->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + + // allocate memory on device. 
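The sign and softmax XPU kernels above share the same usage pattern; the softmax one additionally rejects any axis other than the last dimension (axis == -1 or axis == rank - 1). A compliant sketch under the WITH_XPU assumption:

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name="x", shape=[-1, 8, 128], dtype="float32")
# Only the last dimension is accepted by the XPU softmax kernel; any other
# axis raises the InvalidArgument error enforced above.
y = fluid.layers.softmax(x, axis=-1)

exe = fluid.Executor(fluid.XPUPlace(0))   # assumption: WITH_XPU build
exe.run(fluid.default_startup_program())
out, = exe.run(feed={"x": np.random.rand(2, 8, 128).astype("float32")},
               fetch_list=[y])
print(out.sum(axis=-1))   # each slice sums to ~1.0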
+ dx->mutable_data(context.GetPlace()); + + const int n = SizeToAxis(axis, dx->dims()); + const int d = SizeFromAxis(axis, dx->dims()); + + auto& dev_ctx = context.template device_context(); + int r = + xpu::softmax2d_backward(dev_ctx.x_context(), out->data(), + dout->data(), dx->data(), n, d); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(softmax2d_backward) return wrong " + "value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + softmax, ops::SoftmaxXPUKernel); +REGISTER_OP_XPU_KERNEL( + softmax_grad, + ops::SoftmaxGradXPUKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index ba56e5e36f985..3ac7a5a127b37 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -357,7 +357,8 @@ static void HardLabelSoftmaxWithCrossEntropy( CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); default: - PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op"); + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); break; } #undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL @@ -397,7 +398,8 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data, CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); default: - PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op"); + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); break; } @@ -408,8 +410,10 @@ template class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), - "This kernel only runs on GPU device."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::Unavailable("softmax_with_cross_entropy operator's " + "CUDA kernel only runs on GPU device.")); const Tensor* logits = context.Input("Logits"); const Tensor* labels = context.Input("Label"); Tensor* softmax = context.Output("Softmax"); @@ -469,8 +473,10 @@ template class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), - "This kernel only runs on GPU device."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::Unavailable("softmax_with_cross_entropy operator's " + "CUDA kernel only runs on GPU device.")); const Tensor* labels = context.Input("Label"); const T* loss_grad_data = context.Input(framework::GradVarName("Loss"))->data(); diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 93d8f42ce2175..479973a5daa5f 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -249,6 +249,19 @@ class Squeeze2GradOp : public framework::OperatorWithKernel { } }; +template +class SqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using 
framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("squeeze"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetAttrMap(this->Attrs()); + } +}; + // FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze, // the XShape is used to carry the shape and lod of X which will be used in // squeeze_grad, in this way, the framework can reuse the memory of X @@ -279,8 +292,22 @@ class Squeeze2GradOpMaker : public framework::SingleGradOpMaker { } }; -DECLARE_INPLACE_OP_INFERER(SequeezeInplaceInferer, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(SequeezeGradInplaceInferer, +template +class Squeeze2DoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("squeeze2"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetOutput("XShape", this->Input("XShape")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(SqueezeInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(SqueezeGradInplaceInferer, {framework::GradVarName("Out"), framework::GradVarName("X")}); DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X"); @@ -292,14 +319,18 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, ops::SqueezeGradOpMaker, ops::SqueezeGradOpMaker); REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, + ops::SqueezeDoubleGradOpMaker, + ops::SqueezeDoubleGradOpMaker, ops::SqueezeGradNoNeedBufferVarsInferer); REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker, ops::Squeeze2GradOpMaker, ops::Squeeze2GradOpMaker, - ops::SequeezeInplaceInferer); + ops::SqueezeInplaceInferer); REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp, - ops::SequeezeGradInplaceInferer); + ops::Squeeze2DoubleGradOpMaker, + ops::Squeeze2DoubleGradOpMaker, + ops::SqueezeGradInplaceInferer); REGISTER_OP_CPU_KERNEL( squeeze, ops::SqueezeKernel, diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc new file mode 100644 index 0000000000000..14928061d23dd --- /dev/null +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
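The squeeze/squeeze2 double-grad makers above re-run a squeeze on DDX so that second-order gradients propagate with the right shape. A dygraph sketch of the pattern that reaches this path, with the same 2.0 Python API assumption as the matmul example:

import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.random.rand(2, 1, 3).astype("float32"), stop_gradient=False)

y = paddle.squeeze(x, axis=[1])                  # shape [2, 3]
loss = paddle.sum(y * y)

dx, = paddle.grad(loss, x, create_graph=True)    # first-order gradient, still differentiable
ddx, = paddle.grad(paddle.sum(dx), x)            # exercises the squeeze double-grad maker
print(ddx.shape)                                 # [2, 1, 3]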
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/sum_op.h" +#include +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +template +class SumXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto in_vars = context.MultiInputVar("X"); + auto out_var = context.OutputVar("Out"); + auto *out = context.Output("Out"); + bool in_place = out_var == in_vars[0]; + int N = in_vars.size(); + PADDLE_ENFORCE_EQ( + out_var->IsType(), true, + platform::errors::InvalidArgument("XPU only surpport LodTensor")); + if (!in_place) { + out->mutable_data(context.GetPlace()); + } + auto &dev_ctx = context.template device_context(); + std::vector ptrs(N, nullptr); + int valid_count = 0; + for (int i = 0; i < N; ++i) { + PADDLE_ENFORCE_EQ( + in_vars[i]->IsType(), true, + platform::errors::InvalidArgument("XPU only surpport LodTensor")); + auto &in_t = in_vars[i]->Get(); + if (in_t.numel() == 0) { + continue; + } + ptrs[valid_count] = reinterpret_cast(in_t.data()); + valid_count++; + } + int r = xpu::sum_batch(dev_ctx.x_context(), ptrs.data(), out->data(), + valid_count, out->numel()); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Fatal("XPU kernel error!")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + sum, ops::SumXPUKernel); +#endif diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index ee1361e361830..0e58e1391cfab 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -228,6 +228,19 @@ class UnsqueezeGradOpMaker : public framework::SingleGradOpMaker { } }; +template +class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("unsqueeze"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetAttrMap(this->Attrs()); + } +}; + // FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on // unsqueeze, the XShape is used to carry the shape and lod of X which // will be used in unsqueeze_grad, in this way, the framework can reuse @@ -304,6 +317,20 @@ class Unsqueeze2GradOp : public framework::OperatorWithKernel { } }; +template +class Unsqueeze2DoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("unsqueeze2"); + grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + grad_op->SetOutput("XShape", this->Input("XShape")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer, {framework::GradVarName("Out"), @@ -317,6 +344,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker, ops::UnsqueezeGradOpMaker, ops::UnsqueezeGradOpMaker); REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, + ops::UnsqueezeDoubleGradOpMaker, + ops::UnsqueezeDoubleGradOpMaker, ops::UnsqueezeGradOpNoNeedBufferVarInferer); REGISTER_OPERATOR(unsqueeze2, 
ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, @@ -324,6 +353,8 @@ REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, ops::Unsqueeze2GradOpMaker, ops::UnsqueezeInplaceInferer); REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, + ops::Unsqueeze2DoubleGradOpMaker, + ops::Unsqueeze2DoubleGradOpMaker, ops::UnsqueezeGradInplaceInferer); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a3ae9e48eea30..165321d9c87ff 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -47,6 +47,10 @@ limitations under the License. */ #include #include +#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) +#include +#endif + #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" #include "paddle/fluid/platform/errors.h" @@ -236,13 +240,14 @@ inline std::string SimplifyDemangleStr(std::string str) { } inline std::string GetCurrentTraceBackString() { - static constexpr int TRACE_STACK_LIMIT = 100; std::ostringstream sout; sout << "\n\n--------------------------------------\n"; sout << "C++ Traceback (most recent call last):"; sout << "\n--------------------------------------\n"; -#if !defined(_WIN32) +#if !defined(_WIN32) && !defined(PADDLE_WITH_MUSL) + static constexpr int TRACE_STACK_LIMIT = 100; + void* call_stack[TRACE_STACK_LIMIT]; auto size = backtrace(call_stack, TRACE_STACK_LIMIT); auto symbols = backtrace_symbols(call_stack, size); @@ -261,7 +266,7 @@ inline std::string GetCurrentTraceBackString() { } free(symbols); #else - sout << "Windows not support stack backtrace yet.\n"; + sout << "Not support stack backtrace yet.\n"; #endif return sout.str(); } diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index 32b7efc04c1f2..fb5cf9fb31915 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -25,6 +25,8 @@ limitations under the License. */ classname& operator=(classname&&) = delete #endif +#ifndef PADDLE_WITH_MUSL #if defined(__FLT_MAX__) #define FLT_MAX __FLT_MAX__ #endif // __FLT_MAX__ +#endif // PADDLE_WITH_MUSL diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index d1c5480c0f543..785627a09fb27 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -853,6 +853,9 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerTAcquireForwardPrimitiveDescriptor( is_test ? 
mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training, @@ -919,6 +922,27 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT& src_tz, std::vector& ksize, + std::vector& strides) { + if (ctx.Attr("adaptive")) { + // (jczaja): oneDNN is supporting only unchangable in size pool window + PADDLE_ENFORCE_EQ( + src_tz[src_tz.size() - 1] % ksize[1], 0, + platform::errors::Unimplemented( + "Input dim must be divisible by corressponding ksize dim.")); + PADDLE_ENFORCE_EQ( + src_tz[src_tz.size() - 2] % ksize[0], 0, + platform::errors::Unimplemented( + "Input dim must be divisible by corressponding ksize dim.")); + ksize[0] = src_tz[src_tz.size() - 2] / ksize[0]; + ksize[1] = src_tz[src_tz.size() - 1] / ksize[1]; + strides[0] = ksize[0]; + strides[1] = ksize[1]; + } + } + private: static inline int ComputeCeiledOutput(int input_size, int kernel_size, int padding, int stride) { diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index c1b81159aca97..c5e8ff807a2d3 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -14,19 +14,18 @@ #pragma once -#include -#include - #include + +#include #include +#include #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "glog/logging.h" #if !defined(_WIN32) -#include // dladdr -#include // backtrace +#include // dladdr #include #include #include // std::accumulate diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 9bc603c0ecc2c..ee6e541c9e6c6 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -49,6 +49,8 @@ std::map> op_ins_map = { {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, {"warpctc", {"Logits", "Label", "LogitsLength", "LabelLength"}}, + {"hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b303ddde1366e..0ee725c302215 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,9 +36,9 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" @@ -432,10 +432,12 @@ PYBIND11_MODULE(core_noavx, m) { return map_output; }); - m.def("save_op_compatible_info", [](framework::ProgramDesc &desc) { - framework::OpCompatibleMap op_compatible_map; - op_compatible_map.InitOpCompatibleMap(); - return op_compatible_map.ConvertToProto(desc.OpCompatibleMap()); + m.def("save_op_version_info", [](framework::ProgramDesc &desc) { + framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()}; + framework::compatible::SaveOpVersions( + framework::compatible::OpVersionRegistrar::GetInstance() + .GetVersionMap(), + &pb_vmap); }); m.def( @@ -1313,9 +1315,6 @@ All parameter, weight, gradient are variables in Paddle. 
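ComputeAdaptivePoolParameters above lowers an adaptive pool to a fixed-window oneDNN pool, which is only possible when each spatial input size is an exact multiple of the requested output size (hence the two PADDLE_ENFORCE_EQ checks). A sketch of a compliant adaptive pool; whether the oneDNN kernel is actually selected depends on the build and the use_mkldnn flag, which this snippet does not control:

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name="x", shape=[-1, 3, 32, 32], dtype="float32")
# 32 is divisible by 8, so the adaptive pool can become a plain pool
# with ksize = stride = 32 / 8 = 4 in both spatial dimensions.
y = fluid.layers.adaptive_pool2d(x, pool_size=[8, 8], pool_type="avg")

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
out, = exe.run(feed={"x": np.random.rand(1, 3, 32, 32).astype("float32")},
               fetch_list=[y])
print(out.shape)   # (1, 3, 8, 8)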
py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace", R"DOC( - **Note**: - For multi-card tasks, please use `FLAGS_selected_gpus` environment variable to set the visible GPU device. - The next version will fix the problem with `CUDA_VISIBLE_DEVICES` environment variable. CUDAPlace is a descriptor of a device. It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. @@ -1334,8 +1333,10 @@ All parameter, weight, gradient are variables in Paddle. Examples: .. code-block:: python - import paddle.fluid as fluid - gpu_place = fluid.CUDAPlace(0) + import paddle + + place = paddle.CUDAPlace(0) + paddle.disable_static(place) )DOC") .def("__init__", diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index d587081fbac8a..ad4bc20f9f0b1 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -4,37 +4,26 @@ function(train_test TARGET_NAME) set(multiValueArgs ARGS) cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(arg_list "") - if(train_test_ARGS) - foreach(arg ${train_test_ARGS}) - list(APPEND arg_list "_${arg}") - endforeach() + if (NOT APPLE AND NOT WIN32) + cc_test(test_train_${TARGET_NAME} + SRCS test_train_${TARGET_NAME}.cc + DEPS paddle_fluid_shared + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) else() - list(APPEND arg_list "_") + cc_test(test_train_${TARGET_NAME}${arg} + SRCS test_train_${TARGET_NAME}.cc + DEPS paddle_fluid_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book/) + endif() + set_tests_properties(test_train_${TARGET_NAME} + PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model) + if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_train_${TARGET_NAME} + PROPERTIES TIMEOUT 150) endif() - foreach(arg ${arg_list}) - string(REGEX REPLACE "^_$" "" arg "${arg}") - if (NOT APPLE AND NOT WIN32) - cc_test(test_train_${TARGET_NAME}${arg} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_fluid_shared - ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) - else() - cc_test(test_train_${TARGET_NAME}${arg} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_fluid_api - ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) - endif() - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model) - if(NOT WIN32 AND NOT APPLE) - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES TIMEOUT 150) - endif() - endforeach() endfunction(train_test) if(WITH_TESTING) - train_test(recognize_digits ARGS mlp conv) + train_test(recognize_digits) endif() diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc index e7b698e1a34e2..fb993439bb8e4 100644 --- a/paddle/fluid/train/test_train_recognize_digits.cc +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -32,16 +32,15 @@ DEFINE_string(dirname, "", "Directory of the train model."); namespace paddle { -void Train() { - CHECK(!FLAGS_dirname.empty()); +void Train(std::string model_dir) { framework::InitDevices(false); const auto cpu_place = platform::CPUPlace(); framework::Executor executor(cpu_place); framework::Scope scope; auto train_program = inference::Load( - &executor, &scope, FLAGS_dirname + "__model_combined__.main_program", - FLAGS_dirname + "__params_combined__"); + &executor, &scope, model_dir + "__model_combined__.main_program", + model_dir + "__params_combined__"); std::string loss_name = ""; for (auto op_desc 
: train_program->Block(0).AllOps()) { @@ -87,6 +86,10 @@ void Train() { EXPECT_LT(last_loss, first_loss); } -TEST(train, recognize_digits) { Train(); } +TEST(train, recognize_digits) { + CHECK(!FLAGS_dirname.empty()); + Train(FLAGS_dirname + "recognize_digits_mlp.train.model/"); + Train(FLAGS_dirname + "recognize_digits_conv.train.model/"); +} } // namespace paddle diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 3c52bbdcccaf8..0af32da4e690b 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -235,7 +235,6 @@ from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS -from .framework import SaveLoadConfig #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 1fc29ad042883..c7798b15c67fe 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -744,13 +744,13 @@ def adaptive_localsgd(self): strategy.adaptive_localsgd = True # by default this is false """ - return self.strategy.localsgd + return self.strategy.adaptive_localsgd @adaptive_localsgd.setter @is_strict_auto def adaptive_localsgd(self, flag): if isinstance(flag, bool): - self.strategy.localsgd = flag + self.strategy.adaptive_localsgd = flag else: print("WARNING: adaptive_localsgd should have value of bool type") diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index ad96e1426694f..283589c5f3320 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -19,16 +19,14 @@ class AMPOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(AMPOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.amp_opt = None + self.wrapped_opt = None # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer", - "LocalSGDOptimizer", "GradientMergeOptimizer", "GraphExecutionOptimizer", - "AdaptiveLocalSGDOptimizer", ] self.meta_optimizers_black_list = ["DGCOptimizer"] @@ -37,6 +35,24 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer, super(AMPOptimizer, self)._set_basic_info( loss, role_maker, user_defined_optimizer, user_defined_strategy) + def _init_wrapped_opt(self): + if self.wrapped_opt is not None: + return + + config = self.user_defined_strategy.amp_configs + + custom_white_list = set(config['custom_white_list']) + custom_black_list = set(config['custom_black_list']) + custom_black_varnames = set(config['custom_black_varnames']) + amp_lists = mixed_precision.AutoMixedPrecisionLists( + custom_white_list, custom_black_list, custom_black_varnames) + + self.wrapped_opt = mixed_precision.decorate( + self.inner_opt, amp_lists, config['init_loss_scaling'], + config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'], + config['incr_ratio'], config['decr_ratio'], + config['use_dynamic_loss_scaling']) + def _can_apply(self): if not self.role_maker._is_collective: return False @@ -60,26 +76,31 @@ def _enable_strategy(self, dist_strategy, context): "use_dynamic_loss_scaling": True } + def backward(self, + loss, + 
startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None): + # maybe inner_opt of other meta optimizer + self._init_wrapped_opt() + return self.wrapped_opt.backward(loss, startup_program, parameter_list, + no_grad_set, callbacks) + + def apply_gradients(self, params_grads): + return self.wrapped_opt.apply_gradients(params_grads=params_grads) + + def apply_optimize(self, loss, startup_program, params_grads): + return self.wrapped_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - if self.amp_opt is None: - config = self.user_defined_strategy.amp_configs - custom_white_list = set(config['custom_white_list']) - custom_black_list = set(config['custom_black_list']) - custom_black_varnames = set(config['custom_black_varnames']) - amp_lists = mixed_precision.AutoMixedPrecisionLists( - custom_white_list, custom_black_list, custom_black_varnames) - - self.amp_opt = mixed_precision.decorate( - self.inner_opt, amp_lists, config['init_loss_scaling'], - config['incr_every_n_steps'], config['decr_every_n_nan_or_inf'], - config['incr_ratio'], config['decr_ratio'], - config['use_dynamic_loss_scaling']) - + self._init_wrapped_opt() optimize_ops, params_grads = \ - self.amp_opt.minimize(loss, startup_program, + self.wrapped_opt.minimize(loss, startup_program, parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index 6806a479d30f4..9990021c8506a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -85,6 +85,13 @@ def backward(self, return self.dgc_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + def apply_gradients(self, params_grads): + return self.dgc_opt.apply_gradients(params_grads=params_grads) + + def apply_optimize(self, loss, startup_program, params_grads): + return self.dgc_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index df9887759e16f..64d54ae3bab03 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -98,6 +98,10 @@ def backward(self, def apply_gradients(self, params_grads): return self.lamb_opt.apply_gradients(params_grads=params_grads) + def apply_optimize(self, loss, startup_program, params_grads): + return self.lamb_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index 609d8b85e714c..32c6be505a546 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -85,6 +85,10 @@ def backward(self, def apply_gradients(self, params_grads): return self.lars_opt.apply_gradients(params_grads=params_grads) + def apply_optimize(self, loss, startup_program, params_grads): + return 
self.lars_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index 9f094978d842a..91030f0762934 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -24,7 +24,7 @@ class LocalSGDOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(LocalSGDOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.meta_optimizers_white_list = [] + self.meta_optimizers_white_list = ['AMPOptimizer'] self.meta_optimizers_black_list = [ "GraphExecutionOptimizer", "AdaptiveLocalSGDOptimizer", @@ -195,7 +195,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(AdaptiveLocalSGDOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - self.meta_optimizers_white_list = [] + self.meta_optimizers_white_list = ['AMPOptimizer'] self.meta_optimizers_black_list = [ "GraphExecutionOptimizer", "LocalSGDOptimizer" ] diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 59ca7e633099e..ea2b67ac4bd1f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -18,15 +18,14 @@ class RecomputeOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(RecomputeOptimizer, self).__init__(optimizer) - #self.inner_opt = RO(optimizer) self.inner_opt = optimizer - self.wrapped_opt = RO(optimizer) + self.wrapped_opt = None # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ "LarsOptimizer", "LambOptimizer", - "GradientMergeOptimizer", "GraphExecutionOptimizer", + "DGCOptimizer", ] self.meta_optimizers_black_list = [] @@ -34,8 +33,15 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer, user_defined_strategy): super(RecomputeOptimizer, self)._set_basic_info( loss, role_maker, user_defined_optimizer, user_defined_strategy) - self.wrapped_opt._set_checkpoints( - list(user_defined_strategy.recompute_configs["checkpoints"])) + + def _init_wrapped_opt(self): + if self.wrapped_opt is not None: + return + + configs = self.user_defined_strategy.recompute_configs + + self.wrapped_opt = RO(self.inner_opt) + self.wrapped_opt._set_checkpoints(list(configs["checkpoints"])) def _can_apply(self): if not self.role_maker._is_collective: @@ -62,14 +68,24 @@ def backward(self, parameter_list=None, no_grad_set=None, callbacks=None): + # maybe inner_opt of other meta optimizer + self._init_wrapped_opt() return self.wrapped_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + def apply_gradients(self, params_grads): + return self.wrapped_opt.apply_gradients(params_grads=params_grads) + + def apply_optimize(self, loss, startup_program, params_grads): + return self.wrapped_opt.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): + self._init_wrapped_opt() optimize_ops, params_grads = \ self.wrapped_opt.minimize(loss, startup_program, parameter_list, no_grad_set) diff --git 
a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index c9112ac849ce0..529c664e7083c 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -16,6 +16,7 @@ from ... import default_startup_program from ... import layers from ... import unique_name +from ... import program_guard from . import fp16_utils from .fp16_utils import rewrite_program from .fp16_utils import update_role_var_grad @@ -58,21 +59,40 @@ def __init__(self, optimizer, amp_lists, init_loss_scaling, self._optimizer = optimizer self._amp_lists = amp_lists self._param_grads = None - self._train_program = default_main_program() - self._startup_prog = default_startup_program() + self._train_program = None + self._scaled_loss = None - self._loss_scaling = layers.create_global_var( - name=unique_name.generate("loss_scaling"), - shape=[1], - value=init_loss_scaling, - dtype='float32', - persistable=True) + self._loss_scaling = None + self._init_loss_scaling = init_loss_scaling self._use_dynamic_loss_scaling = use_dynamic_loss_scaling if self._use_dynamic_loss_scaling: self._incr_every_n_steps = incr_every_n_steps self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf self._incr_ratio = incr_ratio self._decr_ratio = decr_ratio + self._num_good_steps = None + self._num_bad_steps = None + + def get_loss_scaling(self): + """Return the real-time loss scaling factor. + """ + return self._loss_scaling + + def get_scaled_loss(self): + """Return the scaled loss. + It's useful when you feed customed loss into executor. + """ + return self._scaled_loss + + def _init_amp_var(self): + self._loss_scaling = layers.create_global_var( + name=unique_name.generate("loss_scaling"), + shape=[1], + value=self._init_loss_scaling, + dtype='float32', + persistable=True) + + if self._use_dynamic_loss_scaling: self._num_good_steps = layers.create_global_var( name=unique_name.generate("num_good_steps"), shape=[1], @@ -86,28 +106,16 @@ def __init__(self, optimizer, amp_lists, init_loss_scaling, dtype='int32', persistable=True) - # Ensure the data type of learning rate vars is float32 (same as the + # Ensure the data type of learning rate vars is float32 (same as the # master parameter dtype) - if isinstance(optimizer._learning_rate, float): - optimizer._learning_rate_map[default_main_program()] = \ - layers.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(optimizer._learning_rate), - dtype='float32', - persistable=True) - - def get_loss_scaling(self): - """Return the real-time loss scaling factor. - """ - return self._loss_scaling - - def get_scaled_loss(self): - """Return the scaled loss. - It's useful when you feed customed loss into executor. - """ - - return self._scaled_loss + if isinstance(self._optimizer._learning_rate, float): + self._optimizer._learning_rate_map[default_main_program()] = \ + layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._optimizer._learning_rate), + dtype='float32', + persistable=True) def backward(self, loss, @@ -131,16 +139,21 @@ def backward(self, A list of (param, grad), which is a tuple of a parameter and its gradient respectively, and the scaled loss. 
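The meta-optimizer changes above (lazily built wrapped optimizers plus apply_gradients/apply_optimize pass-throughs) are what let AMP be stacked with Recompute, LocalSGD and the other strategies. A hedged end-to-end sketch; it assumes the paddle.distributed.fleet 2.0 API and is meant to be launched with paddle.distributed.launch on GPU workers rather than run as-is:

import paddle
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)   # expects the env set up by paddle.distributed.launch

image = fluid.data(name="image", shape=[-1, 784], dtype="float32")
label = fluid.data(name="label", shape=[-1, 1], dtype="int64")
fc = fluid.layers.fc(input=image, size=10)
loss = fluid.layers.mean(
    fluid.layers.softmax_with_cross_entropy(logits=fc, label=label))

strategy = fleet.DistributedStrategy()
strategy.amp = True                                    # AMPOptimizer wraps the inner optimizer
strategy.recompute = True                              # RecomputeOptimizer builds its wrapper lazily
strategy.recompute_configs = {"checkpoints": [fc.name]}

optimizer = fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(loss)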
""" - rewrite_program(self._train_program, self._amp_lists) - self._scaled_loss = loss * self._loss_scaling - self._params_grads = self._optimizer.backward( - self._scaled_loss, startup_program, parameter_list, no_grad_set, - callbacks) - # Change the op_role_var attr for some ops, so that gradients - # transferred across GPUs can be FP16. - update_role_var_grad(self._train_program, self._params_grads) - - return self._params_grads + train_program = loss.block.program + self._train_program = train_program + + with program_guard(train_program, startup_program): + self._init_amp_var() + + rewrite_program(train_program, self._amp_lists) + self._scaled_loss = loss * self._loss_scaling + params_grads = self._optimizer.backward( + self._scaled_loss, startup_program, parameter_list, no_grad_set, + callbacks) + # Change the op_role_var attr for some ops, so that gradients + # transferred across GPUs can be FP16. + update_role_var_grad(train_program, params_grads) + return params_grads def apply_gradients(self, params_grads): """ @@ -182,6 +195,12 @@ def apply_gradients(self, params_grads): return optimize_ops + def apply_optimize(self, loss, startup_program, params_grads): + program = loss.block.program + with program_guard(program, startup_program): + optimize_ops = self.apply_gradients(params_grads) + return optimize_ops + def minimize(self, loss, startup_program=None, @@ -207,7 +226,8 @@ def minimize(self, parameter_list=parameter_list, no_grad_set=no_grad_set) - optimize_ops = self.apply_gradients(scaled_params_grads) + optimize_ops = self.apply_optimize(loss, startup_program, + scaled_params_grads) return optimize_ops, scaled_params_grads diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index b5a8d90194331..eba881a2637ae 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -758,6 +758,7 @@ def _insert_channel_quant_op(self, graph, var_node, name, quant_bits, attrs={ 'bit_length': quant_bits, 'quant_axis': quant_axis, + 'is_test': self._is_test, 'op_role': core.op_proto_and_checker_maker.OpRole.Forward }, inputs={'X': var_node}, @@ -1125,7 +1126,7 @@ def apply(self, graph): self._restore_var(input_arg_name, quantized_param_v) self._remove_fake_quant_and_dequant_op(graph, op_node) -# Remove all fake dequant op + # Remove all fake dequant op ops = graph.all_op_nodes() for op_node in ops: op_name = op_node.name() @@ -1331,16 +1332,25 @@ def _is_float(self, v): def _quant(self, x, scale, num_bits, quant_axis): assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.' 
+ bnt = (1 << (num_bits - 1)) - 1 + + def _clip(x, scale): + x[x > scale] = scale + x[x < -scale] = -scale + return x + if isinstance(scale, list): for i, s in enumerate(scale): if quant_axis == 0: - x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1)) + x[i] = _clip(x[i], s) + x[i] = np.round(x[i] / s * bnt) else: - x[:, i] = np.round(x[:, i] / s * ( - (1 << (num_bits - 1)) - 1)) - return x + x[:, i] = _clip(x[:, i], s) + x[:, i] = np.round(x[:, i] / s * bnt) else: - return np.round(x / scale * ((1 << (num_bits - 1)) - 1)) + x = _clip(x, scale) + x = np.round(x / scale * bnt) + return x class ConvertToInt8Pass(object): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index df505cf2435e7..eb924e13a7e4f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -31,6 +31,7 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.dygraph.nn import Linear from paddle.fluid.log_helper import get_logger +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX paddle.enable_static() @@ -231,10 +232,11 @@ def test_qat_save(self): before_save = lenet(test_img) # save inference quantized model - path = "./mnist_infer_model" + path = "./qat_infer_model/lenet" + save_dir = "./qat_infer_model" paddle.jit.save( layer=lenet, - model_path=path, + path=path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') @@ -245,12 +247,12 @@ def test_qat_save(self): else: place = core.CPUPlace() exe = fluid.Executor(place) - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=path, - executor=exe, - model_filename="__model__", - params_filename="__variables__")) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + dirname=save_dir, + executor=exe, + model_filename="lenet" + INFER_MODEL_SUFFIX, + params_filename="lenet" + INFER_PARAMS_SUFFIX) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) @@ -339,7 +341,7 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000): paddle.jit.save( layer=lenet, - model_path="./dynamic_mnist", + path="./dynamic_mnist/model", input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py index 80d388ac0da62..ddf37a0ebf8c2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py @@ -31,6 +31,7 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.dygraph.nn import Linear from paddle.fluid.log_helper import get_logger +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX paddle.enable_static() @@ -231,10 +232,11 @@ def test_qat_save(self): before_save = lenet(test_img) # save inference quantized model - path = "./mnist_infer_model" + path = "./qat_infer_model/mnist" + save_dir = "./qat_infer_model" paddle.jit.save( layer=lenet, - model_path=path, + path=path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') @@ -245,12 +247,12 @@ def test_qat_save(self): else: place = core.CPUPlace() exe = fluid.Executor(place) - [inference_program, feed_target_names, 
fetch_targets] = ( - fluid.io.load_inference_model( - dirname=path, - executor=exe, - model_filename="__model__", - params_filename="__variables__")) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + dirname=save_dir, + executor=exe, + model_filename="mnist" + INFER_MODEL_SUFFIX, + params_filename="mnist" + INFER_PARAMS_SUFFIX) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) @@ -339,7 +341,7 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000): paddle.jit.save( layer=lenet, - model_path="./dynamic_mnist", + path="./dynamic_mnist/model", input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 9a14c4cdf14a4..ad116c2597064 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -205,8 +205,15 @@ def pre_load(dso_name): load_dso(dso_path) -def get_glibc_ver(): - return run_shell_command("ldd --version | awk '/ldd/{print $NF}'") +def get_libc_ver(): + ldd_glibc = run_shell_command("ldd --version | awk '/ldd/{print $NF}'") + if ldd_glibc is not None: + return ("glibc", ldd_glibc) + + ldd_musl = run_shell_command("ldd 2>&1 | awk '/Version/{print $NF}'") + if ldd_musl is not None: + return ("musl", ldd_musl) + return (None, None) def less_than_ver(a, b): @@ -231,13 +238,14 @@ def to_list(s): # For paddle, the problem is that 'libgomp' is a DSO with static TLS, and it is loaded after 14 DSOs. # So, here is a tricky way to solve the problem by pre load 'libgomp' before 'core_avx.so'. # The final solution is to upgrade glibc to > 2.22 on the target system. -if platform.system().lower() == 'linux' and less_than_ver(get_glibc_ver(), - '2.23'): - try: - pre_load('libgomp') - except Exception as e: - # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so - sys.stderr.write('Error: Can not preload libgomp.so') +if platform.system().lower() == 'linux': + libc_type, libc_ver = get_libc_ver() + if libc_type == 'glibc' and less_than_ver(libc_ver, '2.23'): + try: + pre_load('libgomp') + except Exception as e: + # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so + sys.stderr.write('Error: Can not preload libgomp.so') load_noavx = False diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index f4ea4d670e600..fb87ea4455d34 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -24,8 +24,8 @@ import warnings from .. 
import core from .base import guard -from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs -from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.jit import _SaveLoadConfig +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers __all__ = [ 'save_dygraph', @@ -33,35 +33,23 @@ ] -# NOTE(chenweihang): deprecate load_dygraph's argument keep_name_table, -# ensure compatibility when user still use keep_name_table argument -def deprecate_keep_name_table(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - def __warn_and_build_configs__(keep_name_table): - warnings.warn( - "The argument `keep_name_table` has deprecated, please use `SaveLoadConfig.keep_name_table`.", - DeprecationWarning) - config = SaveLoadConfig() - config.keep_name_table = keep_name_table - return config - - # deal with arg `keep_name_table` - if len(args) > 1 and isinstance(args[1], bool): - args = list(args) - args[1] = __warn_and_build_configs__(args[1]) - # deal with kwargs - elif 'keep_name_table' in kwargs: - kwargs['config'] = __warn_and_build_configs__(kwargs[ - 'keep_name_table']) - kwargs.pop('keep_name_table') - else: - # do nothing - pass +def _parse_load_config(configs): + supported_configs = ['model_filename', 'params_filename', 'keep_name_table'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.fluid.load_dygraph` is not supported." + % (key)) - return func(*args, **kwargs) + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.model_filename = configs.get('model_filename', None) + inner_config.params_filename = configs.get('params_filename', None) + inner_config.keep_name_table = configs.get('keep_name_table', None) - return wrapper + return inner_config @dygraph_only @@ -132,12 +120,12 @@ def save_dygraph(state_dict, model_path): pickle.dump(model_dict, f, protocol=2) +# NOTE(chenweihang): load_dygraph will deprecated in future, we don't +# support new loading features for it # TODO(qingqing01): remove dygraph_only to support loading static model. # maybe need to unify the loading interface after 2.0 API is ready. # @dygraph_only -@deprecate_save_load_configs -@deprecate_keep_name_table -def load_dygraph(model_path, config=None): +def load_dygraph(model_path, **configs): ''' :api_attr: imperative @@ -152,10 +140,13 @@ def load_dygraph(model_path, config=None): Args: model_path(str) : The file prefix store the state_dict. (The path should Not contain suffix '.pdparams') - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` - object that specifies additional configuration options, these options - are for compatibility with ``jit.save/io.save_inference_model`` formats. - Default None. + **configs (dict, optional): other save configuration options for compatibility. We do not + recommend using these configurations, if not necessary, DO NOT use them. Default None. + The following options are currently supported: + (1) model_filename (string): The inference model file name of the paddle 1.x ``save_inference_model`` + save format. Default file name is :code:`__model__` . + (2) params_filename (string): The persistable variables file name of the paddle 1.x ``save_inference_model`` + save format. No default file name, save variables separately by default. 
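For example (editor's addition; the directory and file names below are illustrative), the keyword-style configs replace the old ``SaveLoadConfig`` object:

.. code-block:: python

    import paddle.fluid as fluid

    # state dict saved by `save_dygraph`: no extra configs needed
    para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")

    # model exported by paddle 1.x `save_inference_model` with custom
    # model/params file names
    para_state_dict, _ = fluid.load_dygraph(
        "./infer_model",
        model_filename="model",
        params_filename="params")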
Returns: state_dict(dict) : the dict store the state_dict @@ -196,8 +187,7 @@ def load_dygraph(model_path, config=None): opti_file_path = model_prefix + ".pdopt" # deal with argument `config` - if config is None: - config = SaveLoadConfig() + config = _parse_load_config(configs) if os.path.exists(params_file_path) or os.path.exists(opti_file_path): # Load state dict by `save_dygraph` save format @@ -246,7 +236,6 @@ def load_dygraph(model_path, config=None): persistable_var_dict = _construct_params_and_buffers( model_prefix, programs, - config.separate_params, config.params_filename, append_suffix=False) @@ -255,9 +244,9 @@ def load_dygraph(model_path, config=None): for var_name in persistable_var_dict: para_dict[var_name] = persistable_var_dict[var_name].numpy() - # if __variables.info__ exists, we can recover structured_name - var_info_path = os.path.join(model_prefix, - EXTRA_VAR_INFO_FILENAME) + # if *.info exists, we can recover structured_name + var_info_filename = str(config.params_filename) + ".info" + var_info_path = os.path.join(model_prefix, var_info_filename) if os.path.exists(var_info_path): with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 4a3dacbd1acae..a10adeb14aa7d 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -31,8 +31,10 @@ __all__ = ['TranslatedLayer'] -VARIABLE_FILENAME = "__variables__" -EXTRA_VAR_INFO_FILENAME = "__variables.info__" +INFER_MODEL_SUFFIX = ".pdmodel" +INFER_PARAMS_SUFFIX = ".pdiparams" +INFER_PARAMS_INFO_SUFFIX = ".pdiparams.info" + LOADED_VAR_SUFFIX = "load" PARAMETER_NAME_PREFIX = "param" BUFFER_NAME_PREFIX = "buffer" @@ -424,11 +426,8 @@ def _load_persistable_vars_by_program(model_path, return load_var_dict -def _load_persistable_vars(model_path, - var_info_path, - program_holder, - separate_params=False, - params_filename=None): +def _load_persistable_vars(model_path, var_info_path, program_holder, + params_filename): # 1. load extra var info with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) @@ -464,33 +463,22 @@ def _load_persistable_vars(model_path, new_var = framework._varbase_creator( name=new_name, persistable=True) - # load separate vars - if separate_params is True: - framework._dygraph_tracer().trace_op( - type='load', - inputs={}, - outputs={'Out': new_var}, - attrs={'file_path': os.path.join(model_path, name)}) - new_var.stop_gradient = extra_var_info[name]['stop_gradient'] load_var_dict[new_name] = new_var load_var_list.append(new_var) # 3. load all vars - if separate_params is False: - if params_filename is not None: - var_file_path = os.path.join(model_path, params_filename) - else: - var_file_path = os.path.join(model_path, VARIABLE_FILENAME) - if not os.path.exists(var_file_path): - if len(extra_var_info) != 0: - raise ValueError("The model to be loaded is incomplete.") - else: - framework._dygraph_tracer().trace_op( - type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) + assert params_filename is not None, "params_filename should not be None." 
+ var_file_path = os.path.join(model_path, params_filename) + if not os.path.exists(var_file_path): + if len(extra_var_info) != 0: + raise ValueError("The model to be loaded is incomplete.") + else: + framework._dygraph_tracer().trace_op( + type='load_combine', + inputs={}, + outputs={'Out': load_var_list}, + attrs={'file_path': var_file_path}) return load_var_dict @@ -532,14 +520,13 @@ def _construct_program_holders(model_path, model_filename=None): def _construct_params_and_buffers(model_path, programs, - separate_params=False, params_filename=None, append_suffix=True): - var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) + var_info_filename = str(params_filename) + ".info" + var_info_path = os.path.join(model_path, var_info_filename) if os.path.exists(var_info_path): var_dict = _load_persistable_vars(model_path, var_info_path, - programs['forward'], separate_params, - params_filename) + programs['forward'], params_filename) else: var_dict = _load_persistable_vars_by_program( model_path, programs['forward'], params_filename) @@ -700,18 +687,16 @@ def _construct(model_path, configs=None): raise ValueError("There is no directory named '%s'" % model_path) model_filename = None params_filename = None - separate_params = False if configs is not None: model_filename = configs.model_filename params_filename = configs.params_filename - separate_params = configs.separate_params # 1. load program desc & construct _ProgramHolder programs = _construct_program_holders(model_path, model_filename) # 2. load layer parameters & buffers - persistable_vars = _construct_params_and_buffers( - model_path, programs, separate_params, params_filename) + persistable_vars = _construct_params_and_buffers(model_path, programs, + params_filename) # 3. construct TranslatedLayer object translated_layer = TranslatedLayer(programs, persistable_vars) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 194ebafb08eef..6cdd13fba82ac 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -29,7 +29,7 @@ from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticFunction, unwrap_decorators -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer +from paddle.fluid.dygraph.io import TranslatedLayer, INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX from paddle.fluid.dygraph.layers import Layer from paddle.fluid.executor import Executor, scope_guard from paddle.fluid.framework import Block, ParamBase, Program, Variable @@ -39,7 +39,7 @@ __all__ = [ 'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level', - 'set_verbosity', 'save', 'load', 'SaveLoadConfig' + 'set_verbosity', 'save', 'load' ] @@ -228,73 +228,7 @@ def decorated(python_func): return decorated -class SaveLoadConfig(object): - """ - The additional configuration options may be used in function - ``paddle.jit.save/load`` and ``paddle.load`` . - - Examples: - 1. Using ``SaveLoadConfig`` when saving model - - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - # use SaveLoadconfig when saving model - model_path = "simplenet.example.model" - config = paddle.SaveLoadConfig() - config.model_filename = "__simplenet__" - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - 2. Using ``SaveLoadConfig`` when loading model - - .. code-block:: python - - import paddle - - # enable dygraph mode - paddle.disable_static() - - # use SaveLoadconfig when loading model - model_path = "simplenet.example.model" - config = paddle.SaveLoadConfig() - config.model_filename = "__simplenet__" - infer_net = paddle.jit.load(model_path, config=config) - # inference - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ - +class _SaveLoadConfig(object): def __init__(self): self._output_spec = None self._model_filename = None @@ -316,335 +250,105 @@ def __init__(self): @property def output_spec(self): - """ - Selects the output targets of the saved model ( ``paddle.jit.TranslatedLayer`` ). - By default, all return variables of original Layer's forward function - are kept as the output of the saved TranslatedLayer. - - The ``output_spec`` type should be list[Variable]. If the provided ``output_spec`` - list is not all output variables, the saved model will be pruned according to the - given ``output_spec`` list. - - .. note:: - The ``output_spec`` is only used when saving model. - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - loss = paddle.tensor.mean(z) - return z, loss - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out, loss = net(x) - loss.backward() - adam.step() - adam.clear_grad() - - # use SaveLoadconfig.output_spec - model_path = "simplenet.example.model.output_spec" - config = paddle.SaveLoadConfig() - config.output_spec = [out] - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - infer_net = paddle.jit.load(model_path) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ return self._output_spec @output_spec.setter def output_spec(self, spec): + if spec is None: + return if not isinstance(spec, list): raise TypeError( - "The SaveLoadConfig.output_spec should be 'list', but received input type is %s." + "The config `output_spec` should be 'list', but received input type is %s." 
% type(input)) for var in spec: if not isinstance(var, core.VarBase): raise TypeError( - "The element in SaveLoadConfig.output_spec list should be 'Variable', but received element's type is %s." + "The element in config `output_spec` list should be 'Variable', but received element's type is %s." % type(var)) self._output_spec = spec @property def model_filename(self): - """ - The name of file to save the translated program of target Layer. - Default filename is :code:`__model__` . - - Examples: - .. code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - # saving with configs.model_filename - model_path = "simplenet.example.model.model_filename" - config = paddle.SaveLoadConfig() - config.model_filename = "__simplenet__" - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - # loading with configs.model_filename - infer_net = paddle.jit.load(model_path, config=config) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ return self._model_filename @model_filename.setter def model_filename(self, filename): + if filename is None: + return if not isinstance(filename, six.string_types): raise TypeError( - "The SaveLoadConfig.model_filename should be str, but received input's type is %s." + "The config `model_filename` should be str, but received input's type is %s." % type(filename)) if len(filename) == 0: - raise ValueError( - "The SaveLoadConfig.model_filename is empty string.") + raise ValueError("The config `model_filename` is empty string.") self._model_filename = filename @property def params_filename(self): - """ - The name of file to save all persistable variables in target Layer. - Default file name is :code:`__variables__` . - - Examples: - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - model_path = "simplenet.example.model.params_filename" - config = paddle.SaveLoadConfig() - config.params_filename = "__params__" - - # saving with configs.params_filename - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - - # loading with configs.params_filename - infer_net = paddle.jit.load(model_path, config=config) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ return self._params_filename @params_filename.setter def params_filename(self, filename): + if filename is None: + return if not isinstance(filename, six.string_types): raise TypeError( - "The SaveLoadConfig.params_filename should be str, but received input's type is %s." + "The config `params_filename` should be str, but received input's type is %s." % type(filename)) if len(filename) == 0: - raise ValueError( - "The SaveLoadConfig.params_filename is empty string.") + raise ValueError("The config `params_filename` is empty string.") self._params_filename = filename - # NOTE: [why not use params_filename=None control params saved separately] - # The new save interface does not recommend parameters to be saved separately. - # Here, the concept should be separated as clearly as possible. - # Setting params_filename=None only means that the saved file name is set - # and without any other meaning. New separate_params control for file saved - # separately can makes the concept clearer. - @property - def separate_params(self): - """ - Configure whether to save the Layer parameters as separete files. - (In order to be compatible with the behavior of ``paddle.static.save_inference_model`` ) - - If True, each parameter will be saved to a file separately, the file name is the parameter name, - and the SaveLoadConfig.params_filename configuration will not take effect. Default False. - - .. note:: - Only used for ``paddle.jit.save`` . - - Examples: - .. 
code-block:: python - - import paddle - import paddle.nn as nn - import paddle.optimizer as opt - - class SimpleNet(nn.Layer): - def __init__(self, in_size, out_size): - super(SimpleNet, self).__init__() - self._linear = nn.Linear(in_size, out_size) - - @paddle.jit.to_static - def forward(self, x): - y = self._linear(x) - z = self._linear(y) - return z - - # enable dygraph mode - paddle.disable_static() - - # train model - net = SimpleNet(8, 8) - adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) - x = paddle.randn([4, 8], 'float32') - for i in range(10): - out = net(x) - loss = paddle.tensor.mean(out) - loss.backward() - adam.step() - adam.clear_grad() - - model_path = "simplenet.example.model.separate_params" - config = paddle.SaveLoadConfig() - config.separate_params = True - - # saving with configs.separate_params - paddle.jit.save( - layer=net, - model_path=model_path, - config=config) - # [result] the saved model directory contains: - # linear_0.b_0 linear_0.w_0 __model__ __variables.info__ - - # loading with configs.params_filename - infer_net = paddle.jit.load(model_path, config=config) - x = paddle.randn([4, 8], 'float32') - pred = infer_net(x) - """ - return self._separate_params - - @separate_params.setter - def separate_params(self, value): - if not isinstance(value, bool): - raise TypeError( - "The SaveLoadConfig.separate_params should be bool value, but received input's type is %s." - % type(value)) - self._separate_params = value - @property def keep_name_table(self): - """ - Configures whether keep ``structured_name -> parameter_name`` dict in loaded state dict. - This dict is the debugging information saved when call ``paddle.save`` . - It is generally only used for debugging and does not affect the actual training or inference. - By default, it will not be retained in ``paddle.load`` result. Default: False. - - .. note:: - Only used for ``paddle.load`` . - - Examples: - .. code-block:: python - - import paddle - - paddle.disable_static() - - linear = paddle.nn.Linear(5, 1) - - state_dict = linear.state_dict() - paddle.save(state_dict, "paddle_dy.pdparams") - - config = paddle.SaveLoadConfig() - config.keep_name_table = True - para_state_dict = paddle.load("paddle_dy.pdparams", config) - - print(para_state_dict) - # the name_table is 'StructuredToParameterName@@' - # {'bias': array([0.], dtype=float32), - # 'StructuredToParameterName@@': - # {'bias': u'linear_0.b_0', 'weight': u'linear_0.w_0'}, - # 'weight': array([[ 0.04230034], - # [-0.1222527 ], - # [ 0.7392676 ], - # [-0.8136974 ], - # [ 0.01211023]], dtype=float32)} - """ return self._keep_name_table @keep_name_table.setter def keep_name_table(self, value): + if value is None: + return if not isinstance(value, bool): raise TypeError( - "The SaveLoadConfig.keep_name_table should be bool value, but received input's type is %s." + "The config `keep_name_table` should be bool value, but received input's type is %s." % type(value)) self._keep_name_table = value +def _parse_save_configs(configs): + supported_configs = ['output_spec'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.jit.save` is not supported." 
+ % (key)) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.output_spec = configs.get('output_spec', None) + + return inner_config + + +def _parse_load_config(configs): + supported_configs = ['model_filename', 'params_filename'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.jit.load` is not supported." + % (key)) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.model_filename = configs.get('model_filename', None) + inner_config.params_filename = configs.get('params_filename', None) + + return inner_config + + def _get_input_var_names(inputs, input_spec): name_none_error = "The %s's name is None. " \ "When using jit.save, please set InputSepc's name in " \ @@ -712,47 +416,88 @@ def _get_output_vars(outputs, output_spec): return result_list -# NOTE(chenweihang): change jit.save/load argument `configs` to `config` -def deprecate_save_load_configs(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - if 'configs' in kwargs: - kwargs['config'] = kwargs['configs'] - kwargs.pop('configs') - return func(*args, **kwargs) +# NOTE(chenweihang): [ Handling of use cases of API paddle.jit.load ] +# `paddle.jit.load` may be used to load saved results of: +# 1. Expected cases: +# - paddle.jit.save +# - paddle.static.save_inference_model +# - paddle.fluid.io.save_inference_model +# 2. Error cases: +# - paddle.save: no .pdmodel for prefix +# - paddle.static.save: no .pdiparams but .pdparams exists +# - paddle.fluid.io.save_params/save_persistables: no __model__ +# TODO(chenweihang): polish error message in above error cases +def _build_load_path_and_config(path, config): + # NOTE(chenweihang): If both [prefix save format] and [directory save format] exist, + # raise error, avoid confusing behavior + prefix_format_path = path + INFER_MODEL_SUFFIX + prefix_format_exist = os.path.exists(prefix_format_path) + directory_format_exist = os.path.isdir(path) + if prefix_format_exist and directory_format_exist: + raise ValueError( + "The %s.pdmodel and %s directory exist at the same time, " + "don't know which one to load, please make sure that the specified target " + "of ``path`` is unique." % (path, path)) + elif not prefix_format_exist and not directory_format_exist: + raise ValueError("The ``path`` (%s) to load model not exists." % path) + else: + if prefix_format_exist: + file_prefix = os.path.basename(path) + model_path = os.path.dirname(path) + if config.model_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``model_filename`` config does " + "not take effect.") + config.model_filename = file_prefix + INFER_MODEL_SUFFIX + if config.params_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``params_filename`` config does " + "not take effect.") + config.params_filename = file_prefix + INFER_PARAMS_SUFFIX + else: + # Compatible with the old save_inference_model format + model_path = path - return wrapper + return model_path, config -@deprecate_save_load_configs @switch_to_static_graph -def save(layer, model_path, input_spec=None, config=None): +def save(layer, path, input_spec=None, **configs): """ - Saves input declarative Layer as :ref:`api_imperative_TranslatedLayer` + Saves input Layer as ``paddle.jit.TranslatedLayer`` format model, which can be used for inference or fine-tuning after loading. 
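(Editor's addition for orientation; the prefix below is illustrative.) With the prefix-based ``path`` argument, a typical call and the resulting on-disk layout look like this, and the docstring text that follows spells out the same suffixes:

.. code-block:: python

    import paddle

    # `layer` is assumed to be a trained Layer whose forward method is
    # decorated with @paddle.jit.to_static
    paddle.jit.save(layer, path="example_model/linear")
    # expected files (suffixes defined in paddle/fluid/dygraph/io.py):
    #   example_model/linear.pdmodel         # translated program
    #   example_model/linear.pdiparams       # combined persistable variables
    #   example_model/linear.pdiparams.info  # extra info used for fine-tuning

    loaded = paddle.jit.load("example_model/linear")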
It will save the translated program and all related persistable - variables of input declarative Layer to given ``model_path``. + variables of input Layer to given ``path``. - The default saved translated program file name is ``__model__``, - and the default saved persistable variables file name is ``__variables__``, - and it also saved some additional variable description information to file - ``__variables.info__``, these additional information is used in fine-tuning. + ``path`` is the prefix of saved objects, and the saved translated program file + suffix is ``.pdmodel``, the saved persistable variables file suffix is ``.pdiparams``, + and here also saved some additional variable description information to a file, + its suffix is ``.pdiparams.info``, these additional information is used in fine-tuning. The saved model can be loaded by follow APIs: - - :ref:`api_imperative_jit_load` - - :ref:`api_fluid_io_load_inference_model` (need pass ``params_filename='__variables__'``) + - ``paddle.jit.load`` + - ``paddle.static.load_inference_model`` - Other C++ inference APIs Args: - layer (Layer): the Layer to be saved. The Layer should be decorated by `@declarative`. - model_path (str): the directory to save the model. - input_spec (list[Variable], optional): Describes the input of the saved model. + layer (Layer): the Layer to be saved. The Layer should be decorated by `@paddle.jit.to_static`. + path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. + input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model. It is the example inputs that will be passed to saved TranslatedLayer's forward function. If None, all input variables of the original Layer's forward function would be the inputs of the saved model. Default None. - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object - that specifies additional configuration options. Default None. + **configs (dict, optional): other save configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, + DO NOT use them. Default None. + The following options are currently supported: + (1) output_spec (list[Tensor]): Selects the output targets of the saved model. + By default, all return variables of original Layer's forward function are kept as the + output of the saved model. If the provided ``output_spec`` list is not all output variables, + the saved model will be pruned according to the given ``output_spec`` list. + Returns: None @@ -804,10 +549,6 @@ def train(layer, loader, loss_fn, opt): print("Epoch {} batch {}: loss = {}".format( epoch_id, batch_id, np.mean(loss.numpy()))) - # enable dygraph mode - place = paddle.CPUPlace() - paddle.disable_static(place) - # 1. train & save model. # create network @@ -818,7 +559,6 @@ def train(layer, loader, loss_fn, opt): # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) loader = paddle.io.DataLoader(dataset, - places=place, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, @@ -828,11 +568,11 @@ def train(layer, loader, loss_fn, opt): train(layer, loader, loss_fn, adam) # save - model_path = "linear.example.model" - paddle.jit.save(layer, model_path) + path = "example_model/linear" + paddle.jit.save(layer, path) """ - # 1. input check + # 1. 
input build & check prog_translator = ProgramTranslator() if not prog_translator.enable_to_static: raise RuntimeError( @@ -843,9 +583,17 @@ def train(layer, loader, loss_fn, opt): "The input layer of paddle.jit.save should be 'Layer', but received layer type is %s." % type(layer)) - configs = config - if configs is None: - configs = SaveLoadConfig() + # path check + file_prefix = os.path.basename(path) + if file_prefix == "": + raise ValueError( + "The input path MUST be format of dirname/file_prefix " + "[dirname\\file_prefix in Windows system], but received " + "file_prefix is empty string.") + + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) # avoid change user given input_spec inner_input_spec = None @@ -866,6 +614,9 @@ def train(layer, loader, loss_fn, opt): "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s." % type(var)) + # parse configs + configs = _parse_save_configs(configs) + # 2. get program from Layer # TODO(chenweihang): add support for other method, not only forward if isinstance(layer.forward, StaticFunction): @@ -927,9 +678,12 @@ def train(layer, loader, loss_fn, opt): # 5. save inference model from paddle.fluid.io import save_inference_model - # VARIABLE_FILENAME keep nameing style consistent with '__model__' - if configs.params_filename is None: - configs.params_filename = VARIABLE_FILENAME + # construct new save_inference_model arguments + model_path = dirname + # NOTE(chenweihang): because prefix contains model and params filename, + # so we don't support set model_filename & params_filename + model_filename = file_prefix + INFER_MODEL_SUFFIX + params_filename = file_prefix + INFER_PARAMS_SUFFIX with scope_guard(scope): save_inference_model( @@ -938,9 +692,8 @@ def train(layer, loader, loss_fn, opt): target_vars=output_vars, executor=Executor(_current_expected_place()), main_program=concrete_program.main_program.clone(), - model_filename=configs.model_filename, - params_filename=None - if configs.separate_params else configs.params_filename, + model_filename=model_filename, + params_filename=params_filename, export_for_deployment=configs._export_for_deployment, program_only=configs._program_only) @@ -958,23 +711,23 @@ def train(layer, loader, loss_fn, opt): # Due to compatibility issues, we cannot change the original storage structure, # but we can save these information in `jit.save` without changing the original # storage to improve user experience. So we save extra information into - # file `__variables.info__` - extra_var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) + # file `***.pdiparams.info` + extra_var_info_path = path + INFER_PARAMS_INFO_SUFFIX with open(extra_var_info_path, 'wb') as f: pickle.dump(extra_var_info, f, protocol=2) -@deprecate_save_load_configs @dygraph_only -def load(model_path, config=None): +def load(path, **configs): """ :api_attr: imperative - Load model saved by :ref:`api_imperative_jit_save` or :ref:`api_fluid_io_save_inference_model` - as :ref:`api_imperative_TranslatedLayer`, then performing inference or fine-tune training. + Load model saved by ``paddle.jit.save`` or ``paddle.static.save_inference_model`` or + paddle 1.x API ``paddle.fluid.io.save_inference_model`` as ``paddle.jit.TranslatedLayer``, + then performing inference or fine-tune training. .. 
note:: - For some historical reasons, if you load model saved by :ref:`api_fluid_io_save_inference_model`, + If you load model saved by ``paddle.static.save_inference_model`` , there will be the following limitations when using it in fine-tuning: 1. Imperative mode do not support LoDTensor. All original model's feed targets or parametars that depend on LoD are temporarily unavailable. 2. All saved model's feed targets need to be passed into TranslatedLayer's forward function. @@ -982,15 +735,23 @@ def load(model_path, config=None): 4. The parameter's ``trainable`` information is lost and can not be recovered. Args: - model_path (str): The directory path where the model is saved. - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies - additional configuration options. Default None. + path (str): The path prefix to load model. The format is ``dirname/file_prefix`` or ``file_prefix``. + **configs (dict, optional): other load configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, + DO NOT use them. Default None. + The following options are currently supported: + (1) model_filename (string): The inference model file name of the paddle 1.x + ``save_inference_model`` save format. Default file name is :code:`__model__` . + (2) params_filename (string): The persistable variables file name of the paddle 1.x + ``save_inference_model`` save format. No default file name, save variables separately + by default. + Returns: TranslatedLayer: A Layer object can run saved translated model. Examples: - 1. Load model saved by :ref:`api_imperative_jit_save` then performing inference and fine-tune training. + 1. Load model saved by ``paddle.jit.save`` then performing inference and fine-tune training. .. code-block:: python @@ -1039,10 +800,6 @@ def train(layer, loader, loss_fn, opt): print("Epoch {} batch {}: loss = {}".format( epoch_id, batch_id, np.mean(loss.numpy()))) - # enable dygraph mode - place = paddle.CPUPlace() - paddle.disable_static(place) - # 1. train & save model. # create network @@ -1053,7 +810,6 @@ def train(layer, loader, loss_fn, opt): # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) loader = paddle.io.DataLoader(dataset, - places=place, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, @@ -1063,13 +819,13 @@ def train(layer, loader, loss_fn, opt): train(layer, loader, loss_fn, adam) # save - model_path = "linear.example.model" - paddle.jit.save(layer, model_path) + path = "example_model/linear" + paddle.jit.save(layer, path) # 2. load model # load - loaded_layer = paddle.jit.load(model_path) + loaded_layer = paddle.jit.load(path) # inference loaded_layer.eval() @@ -1082,15 +838,17 @@ def train(layer, loader, loss_fn, opt): train(loaded_layer, loader, loss_fn, adam) - 2. Load model saved by :ref:`api_fluid_io_save_inference_model` then performing and fine-tune training. + 2. Load model saved by ``paddle.fluid.io.save_inference_model`` then performing and fine-tune training. .. 
code-block:: python import numpy as np import paddle import paddle.fluid as fluid + import paddle.static as static import paddle.nn as nn import paddle.optimizer as opt + import paddle.nn.functional as F BATCH_SIZE = 16 BATCH_NUM = 4 @@ -1112,18 +870,18 @@ def __getitem__(self, idx): def __len__(self): return self.num_samples - image = fluid.data(name='image', shape=[None, 784], dtype='float32') - label = fluid.data(name='label', shape=[None, 1], dtype='int64') - pred = fluid.layers.fc(input=image, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss) + image = static.data(name='image', shape=[None, 784], dtype='float32') + label = static.data(name='label', shape=[None, 1], dtype='int64') + pred = static.nn.fc(input=image, size=10, act='softmax') + loss = F.cross_entropy(input=pred, label=label) + avg_loss = paddle.mean(loss) - optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = paddle.CPUPlace() + exe = static.Executor(place) + exe.run(static.default_startup_program()) # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) @@ -1138,7 +896,7 @@ def __len__(self): # 1. train and save inference model for data in loader(): exe.run( - fluid.default_main_program(), + static.default_main_program(), feed=data, fetch_list=[avg_loss]) @@ -1179,6 +937,10 @@ def __len__(self): print("Epoch {} batch {}: loss = {}".format( epoch_id, batch_id, np.mean(loss.numpy()))) """ + # 1. construct correct config + config = _parse_load_config(configs) + model_path, config = _build_load_path_and_config(path, config) + return TranslatedLayer._construct(model_path, config) diff --git a/python/paddle/fluid/dygraph/static_runner.py b/python/paddle/fluid/dygraph/static_runner.py index d482077cd4f2a..e8738da07e993 100644 --- a/python/paddle/fluid/dygraph/static_runner.py +++ b/python/paddle/fluid/dygraph/static_runner.py @@ -14,7 +14,7 @@ from __future__ import print_function -from paddle.fluid.dygraph.jit import SaveLoadConfig +from paddle.fluid.dygraph.jit import _SaveLoadConfig from paddle.fluid.dygraph.io import TranslatedLayer @@ -31,7 +31,7 @@ class StaticModelRunner(object): """ def __new__(cls, model_dir, model_filename=None, params_filename=None): - configs = SaveLoadConfig() + configs = _SaveLoadConfig() if model_filename is not None: configs.model_filename = model_filename if params_filename is not None: diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 7d067b6347844..f5660c3fc91a1 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -54,11 +54,11 @@ def global_scope(): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle import numpy - fluid.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), fluid.CPUPlace()) - numpy.array(fluid.global_scope().find_var("data").get_tensor()) + paddle.static.global_scope().var("data").get_tensor().set(numpy.ones((2, 2)), paddle.CPUPlace()) + numpy.array(paddle.static.global_scope().find_var("data").get_tensor()) """ return g_scope diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index fe5b683bdeaa3..bb55aeb70d1f2 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1346,7 +1346,7 @@ def save_inference_model(dirname, append_fetch_ops(main_program, fetch_var_names) main_program.desc._set_version() - paddle.fluid.core.save_op_compatible_info(main_program.desc) + paddle.fluid.core.save_op_version_info(main_program.desc) with open(model_basename, "wb") as f: f.write(main_program.desc.serialize_to_string()) else: @@ -1720,7 +1720,7 @@ def get_tensor(var): main_program = program.clone() program.desc.flush() main_program.desc._set_version() - paddle.fluid.core.save_op_compatible_info(program.desc) + paddle.fluid.core.save_op_version_info(program.desc) with open(model_path + ".pdmodel", "wb") as f: f.write(program.desc.serialize_to_string()) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index dbcd91eedbdf1..8cb0404c18cad 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9592,10 +9592,6 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): @templatedoc() def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): """ - :alias_main: paddle.nn.functional.hard_sigmoid - :alias: paddle.nn.functional.hard_sigmoid,paddle.nn.functional.activation.hard_sigmoid - :old_api: paddle.fluid.layers.hard_sigmoid - ${comment} Parameters: x (${x_type}): ${x_comment} @@ -9613,9 +9609,15 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + data = fluid.layers.fill_constant(shape=[3, 2], value=0.5, dtype='float32') # [[0.5, 0.5], [0.5, 0.5], [0.5, 0.5]] result = fluid.layers.hard_sigmoid(data) # [[0.6, 0.6], [0.6, 0.6], [0.6, 0.6]] """ + if in_dygraph_mode(): + return core.ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hard_sigmoid') @@ -9802,10 +9804,6 @@ def prelu(x, mode, param_attr=None, name=None): @templatedoc() def brelu(x, t_min=0.0, t_max=24.0, name=None): """ - :alias_main: paddle.nn.functional.brelu - :alias: paddle.nn.functional.brelu,paddle.nn.functional.activation.brelu - :old_api: paddle.fluid.layers.brelu - ${comment} Args: x(${x_type}): ${x_comment} @@ -9821,7 +9819,9 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): .. code-block:: python import paddle.fluid as fluid + import paddle import numpy as np + paddle.enable_static() input_brelu = np.array([[-1,6],[1,15.6]]) with fluid.dygraph.guard(): @@ -9831,6 +9831,9 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): #[[ 1. 6.] #[ 1. 
10.]] """ + if in_dygraph_mode(): + return core.ops.brelu(x, 't_min', t_min, 't_max', t_max) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'brelu') helper = LayerHelper('brelu', **locals()) @@ -12564,13 +12567,10 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): return out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.maxout") @templatedoc() def maxout(x, groups, name=None, axis=1): """ - :alias_main: paddle.nn.functional.maxout - :alias: paddle.nn.functional.maxout,paddle.nn.functional.activation.maxout - :old_api: paddle.fluid.layers.maxout - ${comment} Args: @@ -12592,31 +12592,16 @@ def maxout(x, groups, name=None, axis=1): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + input = fluid.data( name='data', shape=[None, 256, 32, 32], dtype='float32') out = fluid.layers.maxout(input, groups=2) """ - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout') - - helper = LayerHelper("maxout", **locals()) - if axis not in [1, -1, 3]: - raise ValueError( - "Attr(axis) should be 1 when data format is NCHW, -1 or 3 when data format is NHWC. Received " - "Attr(axis): %s." % str(axis)) - if axis == -1: - axis = 3 - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op( - type="maxout", - inputs={"X": x}, - attrs={"groups": groups, - "axis": axis}, - outputs={"Out": out}) - return out + return paddle.nn.functional.maxout(**locals()) def space_to_depth(x, blocksize, name=None): @@ -13187,12 +13172,10 @@ def add_position_encoding(input, alpha, beta, name=None): Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F - tensor = np.random.randn(16, 32, 64) - tensor = paddle.to_tensor(tensor) + tensor = paddle.randn([16, 32, 64]) position_tensor = F.add_position_encoding( input=tensor, alpha=1.0, beta=1.0) @@ -14879,10 +14862,6 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): @templatedoc() def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): """ - :alias_main: paddle.nn.functional.hard_swish - :alias: paddle.nn.functional.hard_swish,paddle.nn.functional.activation.hard_swish - :old_api: paddle.fluid.layers.hard_swish - This operator implements the hard_swish activation function. Hard_swish is proposed in MobileNetV3, and performs better in computational stability and efficiency compared to swish function. For more details please refer to: https://arxiv.org/pdf/1905.02244.pdf @@ -14913,7 +14892,9 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): .. 
code-block:: python import paddle.fluid as fluid + import paddle import numpy as np + paddle.enable_static() DATATYPE='float32' @@ -14928,6 +14909,10 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): out, = exe.run(feed={'x':x_data}, fetch_list=[y.name]) print(out) # [[0.66666667, 1.66666667,3., 4.]] """ + if in_dygraph_mode(): + return core.ops.hard_swish(x, 'threshold', threshold, 'scale', scale, + 'offset', offset) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'hard_swish') diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 1abaafbf7b2ab..6e2756be821ae 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -2541,23 +2541,17 @@ def lstm(input, input_shape = list(input.shape) input_size = input_shape[-1] weight_size = 0 + num_dirrection = 2 if is_bidirec == True else 1 + for i in range(num_layers): if i == 0: - input_weight_size = (input_size * hidden_size) * 4 + input_weight_size = (input_size * hidden_size) * 4 * num_dirrection else: - if is_bidirec: - input_weight_size = (hidden_size * 2 * hidden_size) * 4 - else: - input_weight_size = (hidden_size * hidden_size) * 4 + input_weight_size = (hidden_size * hidden_size) * 4 * num_dirrection + hidden_weight_size = (hidden_size * hidden_size) * 4 * num_dirrection - hidden_weight_size = (hidden_size * hidden_size) * 4 - - if is_bidirec: - weight_size += (input_weight_size + hidden_weight_size) * 2 - weight_size += hidden_size * 8 * 2 - else: - weight_size += input_weight_size + hidden_weight_size - weight_size += hidden_size * 8 + weight_size += input_weight_size + hidden_weight_size + weight_size += hidden_size * 8 * num_dirrection weight = helper.create_parameter( attr=helper.param_attr, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 4a9ce4454af0b..367be181f4725 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -731,9 +731,6 @@ def _process_distribute_lookuptable(self, param_grads): outputs={"ParamOut": param_and_grad[0]}) return new_param_grads, (table_param, table_grad), sgd_op - def _append_dgc_ops(self, param_and_grad): - pass - def backward(self, loss, startup_program=None, @@ -801,9 +798,6 @@ def backward(self, with program_guard(program, startup_program): params_grads = append_backward(loss, parameter_list, act_no_grad_set, callbacks) - # Note: since we can't use all_reduce_op now, - # dgc_op should be the last op of one grad. - self._append_dgc_ops(params_grads) return params_grads def apply_gradients(self, params_grads): @@ -1569,6 +1563,11 @@ def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var, @imperative_base.no_grad def apply_gradients(self, params_grads): + # Note: since we can't use all_reduce_op now, + # dgc_op should be the last op of one grad. + # Maybe need a grad allreduce pass. + self._append_dgc_ops(params_grads) + params_grads = sorted(params_grads, key=lambda x: x[0].name) params_grads, table_param_and_grad, table_optimize_op = \ self._process_distribute_lookuptable(params_grads) @@ -4784,10 +4783,6 @@ def mlp(input_x, input_y, hid_dim=128, label_dim=2): params_grads = append_backward( loss, parameter_list, no_grad_set, checkpoints=checkpoint_vars) - # Note: since we can't use all_reduce_op now, - # dgc_op should be the last op of one grad. 
- if hasattr(self._optimizer, "_append_dgc_ops"): - self._optimizer._append_dgc_ops(params_grads) return params_grads def apply_optimize(self, loss, startup_program, params_grads): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py index ba0adaf32e15d..63edd35f59bd4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py @@ -28,11 +28,12 @@ class PredictorTools(object): Paddle-Inference predictor ''' - def __init__(self, model_path, params_file, feeds_var): + def __init__(self, model_path, model_file, params_file, feeds_var): ''' __init__ ''' self.model_path = model_path + self.model_file = model_file self.params_file = params_file self.feeds_var = feeds_var @@ -43,7 +44,7 @@ def _load_model_and_set_config(self): ''' if os.path.exists(os.path.join(self.model_path, self.params_file)): config = AnalysisConfig( - os.path.join(self.model_path, "__model__"), + os.path.join(self.model_path, self.model_file), os.path.join(self.model_path, self.params_file)) else: config = AnalysisConfig(os.path.join(self.model_path)) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py index f105dd5e94744..6c26189a4adb3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import time import unittest - import numpy as np + +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from bert_dygraph_model import PretrainModelLayer from bert_utils import get_bert_config, get_feed_data_reader @@ -31,7 +33,10 @@ SEED = 2020 STEP_NUM = 10 PRINT_STEP = 2 -MODEL_SAVE_PATH = "./bert.inference.model" +MODEL_SAVE_DIR = "./inference" +MODEL_SAVE_PREFIX = "./inference/bert" +MODEL_FILENAME = "bert" + INFER_MODEL_SUFFIX +PARAMS_FILENAME = "bert" + INFER_PARAMS_SUFFIX DY_STATE_DICT_SAVE_PATH = "./bert.dygraph" @@ -85,7 +90,7 @@ def train(bert_config, data_reader, to_static): step_idx += 1 if step_idx == STEP_NUM: if to_static: - fluid.dygraph.jit.save(bert, MODEL_SAVE_PATH) + fluid.dygraph.jit.save(bert, MODEL_SAVE_PREFIX) else: fluid.dygraph.save_dygraph(bert.state_dict(), DY_STATE_DICT_SAVE_PATH) @@ -104,11 +109,15 @@ def train_static(bert_config, data_reader): def predict_static(data): + paddle.enable_static() exe = fluid.Executor(place) # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME) + MODEL_SAVE_DIR, + executor=exe, + model_filename=MODEL_FILENAME, + params_filename=PARAMS_FILENAME) pred_res = exe.run(inference_program, feed=dict(zip(feed_target_names, data)), fetch_list=fetch_targets) @@ -143,7 +152,7 @@ def predict_dygraph(bert_config, data): def predict_dygraph_jit(data): with fluid.dygraph.guard(place): - bert = fluid.dygraph.jit.load(MODEL_SAVE_PATH) + bert = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) bert.eval() src_ids, pos_ids, sent_ids, input_mask, mask_label, 
mask_pos, labels = data @@ -155,7 +164,8 @@ def predict_dygraph_jit(data): def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, data) + output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, + data) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index af7e73c41464d..f54f70e4b854b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -21,7 +21,7 @@ from paddle.fluid import ParamAttr from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph import ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from predictor_utils import PredictorTools @@ -422,7 +422,10 @@ class Args(object): prop_boundary_ratio = 0.5 num_sample = 2 num_sample_perbin = 2 - infer_dir = './bmn_infer_model' + model_save_dir = "./inference" + model_save_prefix = "./inference/bmn" + model_filename = "bmn" + INFER_MODEL_SUFFIX + params_filename = "bmn" + INFER_PARAMS_SUFFIX dy_param_path = './bmn_dy_param' @@ -620,7 +623,7 @@ def train_bmn(args, place, to_static): if batch_id == args.train_batch_num: if to_static: - fluid.dygraph.jit.save(bmn, args.infer_dir) + fluid.dygraph.jit.save(bmn, args.model_save_prefix) else: fluid.dygraph.save_dygraph(bmn.state_dict(), args.dy_param_path) @@ -735,13 +738,15 @@ def predict_dygraph(self, data): return pred_res def predict_static(self, data): + paddle.enable_static() exe = fluid.Executor(self.place) # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - self.args.infer_dir, + self.args.model_save_dir, executor=exe, - params_filename=VARIABLE_FILENAME) + model_filename=self.args.model_filename, + params_filename=self.args.params_filename) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, fetch_list=fetch_targets) @@ -750,7 +755,7 @@ def predict_static(self, data): def predict_dygraph_jit(self, data): with fluid.dygraph.guard(self.place): - bmn = fluid.dygraph.jit.load(self.args.infer_dir) + bmn = fluid.dygraph.jit.load(self.args.model_save_prefix) bmn.eval() x = to_variable(data) @@ -760,7 +765,9 @@ def predict_dygraph_jit(self, data): return pred_res def predict_analysis_inference(self, data): - output = PredictorTools(self.args.infer_dir, VARIABLE_FILENAME, [data]) + output = PredictorTools(self.args.model_save_dir, + self.args.model_filename, + self.args.params_filename, [data]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 4d735b565ddbc..d8cb3854d3e23 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -26,7 +26,7 @@ from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph import Embedding, Linear, GRUUnit from paddle.fluid.dygraph import declarative, ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from predictor_utils import PredictorTools @@ -395,7 +395,10 @@ class Args(object): base_learning_rate = 0.01 bigru_num = 2 print_steps = 1 - model_save_dir = 
"./lac_model" + model_save_dir = "./inference" + model_save_prefix = "./inference/lac" + model_filename = "lac" + INFER_MODEL_SUFFIX + params_filename = "lac" + INFER_PARAMS_SUFFIX dy_param_path = "./lac_dy_param" @@ -498,13 +501,11 @@ def do_train(args, to_static): step += 1 # save inference model if to_static: - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [crf_decode] fluid.dygraph.jit.save( layer=model, - model_path=args.model_save_dir, + path=args.model_save_prefix, input_spec=[words, length], - configs=configs) + output_spec=[crf_decode]) else: fluid.dygraph.save_dygraph(model.state_dict(), args.dy_param_path) @@ -573,13 +574,15 @@ def predict_static(self, batch): LAC model contains h_0 created in `__init__` that is necessary for inferring. Load inference model to test it's ok for prediction. """ + paddle.enable_static() exe = fluid.Executor(self.place) # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( self.args.model_save_dir, executor=exe, - params_filename=VARIABLE_FILENAME) + model_filename=self.args.model_filename, + params_filename=self.args.params_filename) words, targets, length = batch pred_res = exe.run( @@ -592,7 +595,7 @@ def predict_static(self, batch): def predict_dygraph_jit(self, batch): words, targets, length = batch with fluid.dygraph.guard(self.place): - model = fluid.dygraph.jit.load(self.args.model_save_dir) + model = fluid.dygraph.jit.load(self.args.model_save_prefix) model.eval() pred_res = model(to_variable(words), to_variable(length)) @@ -602,8 +605,9 @@ def predict_dygraph_jit(self, batch): def predict_analysis_inference(self, batch): words, targets, length = batch - output = PredictorTools(self.args.model_save_dir, VARIABLE_FILENAME, - [words, length]) + output = PredictorTools(self.args.model_save_dir, + self.args.model_filename, + self.args.params_filename, [words, length]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index bd600d2f2dbd6..8a21c4cfd0eca 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -25,7 +25,7 @@ from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from predictor_utils import PredictorTools @@ -218,34 +218,39 @@ def train(self, to_static=False): def check_jit_save_load(self, model, inputs, input_spec, to_static, gt_out): if to_static: infer_model_path = "./test_mnist_inference_model_by_jit_save" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [gt_out] + model_save_dir = "./inference" + model_save_prefix = "./inference/mnist" + model_filename = "mnist" + INFER_MODEL_SUFFIX + params_filename = "mnist" + INFER_PARAMS_SUFFIX fluid.dygraph.jit.save( layer=model, - model_path=infer_model_path, + path=model_save_prefix, input_spec=input_spec, - configs=configs) + output_spec=[gt_out]) # load in static mode static_infer_out = self.jit_load_and_run_inference_static( - infer_model_path, inputs) + model_save_dir, model_filename, params_filename, inputs) self.assertTrue(np.allclose(gt_out.numpy(), 
static_infer_out)) # load in dygraph mode dygraph_infer_out = self.jit_load_and_run_inference_dygraph( - infer_model_path, inputs) + model_save_prefix, inputs) self.assertTrue(np.allclose(gt_out.numpy(), dygraph_infer_out)) # load in Paddle-Inference predictor_infer_out = self.predictor_load_and_run_inference_analysis( - infer_model_path, inputs) + model_save_dir, model_filename, params_filename, inputs) self.assertTrue(np.allclose(gt_out.numpy(), predictor_infer_out)) @switch_to_static_graph - def jit_load_and_run_inference_static(self, model_path, inputs): + def jit_load_and_run_inference_static(self, model_path, model_filename, + params_filename, inputs): + paddle.enable_static() exe = fluid.Executor(self.place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( dirname=model_path, executor=exe, - params_filename=VARIABLE_FILENAME) + model_filename=model_filename, + params_filename=params_filename) assert len(inputs) == len(feed_target_names) results = exe.run(inference_program, feed=dict(zip(feed_target_names, inputs)), @@ -258,8 +263,10 @@ def jit_load_and_run_inference_dygraph(self, model_path, inputs): pred = infer_net(inputs[0]) return pred.numpy() - def predictor_load_and_run_inference_analysis(self, model_path, inputs): - output = PredictorTools(model_path, VARIABLE_FILENAME, inputs) + def predictor_load_and_run_inference_analysis( + self, model_path, model_filename, params_filename, inputs): + output = PredictorTools(model_path, model_filename, params_filename, + inputs) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index a377075062b26..a086bf1455a81 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -20,7 +20,7 @@ from paddle.fluid.param_attr import ParamAttr from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph import declarative, ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX import unittest @@ -439,7 +439,10 @@ class Args(object): train_step = 10 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() - model_save_path = model + ".inference.model" + model_save_dir = "./inference" + model_save_prefix = "./inference/" + model + model_filename = model + INFER_MODEL_SUFFIX + params_filename = model + INFER_PARAMS_SUFFIX dy_state_dict_save_path = model + ".dygraph" @@ -504,7 +507,7 @@ def train_mobilenet(args, to_static): t_last = time.time() if batch_id > args.train_step: if to_static: - fluid.dygraph.jit.save(net, args.model_save_path) + fluid.dygraph.jit.save(net, args.model_save_prefix) else: fluid.dygraph.save_dygraph(net.state_dict(), args.dy_state_dict_save_path) @@ -514,11 +517,15 @@ def train_mobilenet(args, to_static): def predict_static(args, data): + paddle.enable_static() exe = fluid.Executor(args.place) # load inference model [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - args.model_save_path, executor=exe, params_filename=VARIABLE_FILENAME) + args.model_save_dir, + executor=exe, + model_filename=args.model_filename, + params_filename=args.params_filename) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, @@ -545,7 +552,7 @@ def 
predict_dygraph(args, data): def predict_dygraph_jit(args, data): with fluid.dygraph.guard(args.place): - model = fluid.dygraph.jit.load(args.model_save_path) + model = fluid.dygraph.jit.load(args.model_save_prefix) model.eval() pred_res = model(data) @@ -554,7 +561,8 @@ def predict_dygraph_jit(args, data): def predict_analysis_inference(args, data): - output = PredictorTools(args.model_save_path, VARIABLE_FILENAME, [data]) + output = PredictorTools(args.model_save_dir, args.model_filename, + args.params_filename, [data]) out = output() return out @@ -565,7 +573,9 @@ def setUp(self): def train(self, model_name, to_static): self.args.model = model_name - self.args.model_save_path = model_name + ".inference.model" + self.args.model_save_prefix = "./inference/" + model_name + self.args.model_filename = model_name + INFER_MODEL_SUFFIX + self.args.params_filename = model_name + INFER_PARAMS_SUFFIX self.args.dy_state_dict_save_path = model_name + ".dygraph" out = train_mobilenet(self.args, to_static) return out @@ -579,7 +589,9 @@ def assert_same_loss(self, model_name): def assert_same_predict(self, model_name): self.args.model = model_name - self.args.model_save_path = model_name + ".inference.model" + self.args.model_save_prefix = "./inference/" + model_name + self.args.model_filename = model_name + INFER_MODEL_SUFFIX + self.args.params_filename = model_name + INFER_PARAMS_SUFFIX self.args.dy_state_dict_save_path = model_name + ".dygraph" local_random = np.random.RandomState(SEED) image = local_random.random_sample([1, 3, 224, 224]).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 203c8ddb3488c..095940d79eac6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -24,7 +24,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph import declarative, ProgramTranslator from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from predictor_utils import PredictorTools @@ -38,7 +38,11 @@ epoch_num = 1 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ else fluid.CPUPlace() -MODEL_SAVE_PATH = "./resnet.inference.model" + +MODEL_SAVE_DIR = "./inference" +MODEL_SAVE_PREFIX = "./inference/resnet" +MODEL_FILENAME = "resnet" + INFER_MODEL_SUFFIX +PARAMS_FILENAME = "resnet" + INFER_PARAMS_SUFFIX DY_STATE_DICT_SAVE_PATH = "./resnet.dygraph" program_translator = ProgramTranslator() @@ -261,7 +265,7 @@ def train(to_static): total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) if batch_id == 10: if to_static: - fluid.dygraph.jit.save(resnet, MODEL_SAVE_PATH) + fluid.dygraph.jit.save(resnet, MODEL_SAVE_PREFIX) else: fluid.dygraph.save_dygraph(resnet.state_dict(), DY_STATE_DICT_SAVE_PATH) @@ -287,10 +291,14 @@ def predict_dygraph(data): def predict_static(data): + paddle.enable_static() exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME) + MODEL_SAVE_DIR, + executor=exe, + model_filename=MODEL_FILENAME, + params_filename=PARAMS_FILENAME) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, @@ -301,7 +309,7 @@ def predict_static(data): def 
predict_dygraph_jit(data): with fluid.dygraph.guard(place): - resnet = fluid.dygraph.jit.load(MODEL_SAVE_PATH) + resnet = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) resnet.eval() pred_res = resnet(data) @@ -310,7 +318,8 @@ def predict_dygraph_jit(data): def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data]) + output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, + [data]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index 75c251253c05a..a8cfeb90bd814 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -34,7 +34,11 @@ epoch_num = 1 place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ else paddle.CPUPlace() -MODEL_SAVE_PATH = "./resnet_v2.inference.model" + +MODEL_SAVE_DIR = "./inference" +MODEL_SAVE_PREFIX = "./inference/resnet_v2" +MODEL_FILENAME = "resnet_v2" + paddle.fluid.dygraph.io.INFER_MODEL_SUFFIX +PARAMS_FILENAME = "resnet_v2" + paddle.fluid.dygraph.io.INFER_PARAMS_SUFFIX DY_STATE_DICT_SAVE_PATH = "./resnet_v2.dygraph" program_translator = paddle.jit.ProgramTranslator() @@ -255,7 +259,7 @@ def train(to_static): total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) if batch_id == 10: if to_static: - paddle.jit.save(resnet, MODEL_SAVE_PATH) + paddle.jit.save(resnet, MODEL_SAVE_PREFIX) else: paddle.fluid.dygraph.save_dygraph(resnet.state_dict(), DY_STATE_DICT_SAVE_PATH) @@ -289,9 +293,10 @@ def predict_static(data): exe = paddle.static.Executor(place) [inference_program, feed_target_names, fetch_targets] = paddle.static.load_inference_model( - MODEL_SAVE_PATH, + MODEL_SAVE_DIR, executor=exe, - params_filename=paddle.fluid.dygraph.io.VARIABLE_FILENAME) + model_filename=MODEL_FILENAME, + params_filename=PARAMS_FILENAME) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, @@ -302,7 +307,7 @@ def predict_static(data): def predict_dygraph_jit(data): paddle.disable_static(place) - resnet = paddle.jit.load(MODEL_SAVE_PATH) + resnet = paddle.jit.load(MODEL_SAVE_PREFIX) resnet.eval() pred_res = resnet(data) @@ -313,8 +318,8 @@ def predict_dygraph_jit(data): def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_PATH, - paddle.fluid.dygraph.io.VARIABLE_FILENAME, [data]) + output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, + [data]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py index cf7708c675aa9..b431d5ae048a9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py @@ -16,14 +16,14 @@ import os import unittest - import numpy as np -import paddle.fluid as fluid +import paddle +import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX SEED = 2020 
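The test updates above all follow the same migration: `fluid.dygraph.jit.save` now takes a path prefix (with `output_spec` passed directly) instead of a directory plus `SaveLoadConfig`, and the static-graph reload calls `paddle.enable_static()` and passes explicit `model_filename`/`params_filename` built from `INFER_MODEL_SUFFIX`/`INFER_PARAMS_SUFFIX`. A minimal sketch of the new pattern; the layer, shapes, and save paths are placeholders, not part of this patch:

```python
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import declarative
from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX

SAVE_DIR = "./inference"            # directory handed to load_inference_model
SAVE_PREFIX = "./inference/demo"    # prefix handed to jit.save
MODEL_FILENAME = "demo" + INFER_MODEL_SUFFIX    # e.g. "demo" + model suffix
PARAMS_FILENAME = "demo" + INFER_PARAMS_SUFFIX  # e.g. "demo" + params suffix


class DemoNet(fluid.dygraph.Layer):
    """Tiny stand-in layer, only for illustrating the save/load flow."""

    def __init__(self):
        super(DemoNet, self).__init__()
        self.linear = fluid.dygraph.Linear(10, 3)

    @declarative
    def forward(self, x):
        return self.linear(x)


place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
    net = DemoNet()
    # run once so the declarative forward records its input spec
    net(fluid.dygraph.to_variable(
        np.random.random([4, 10]).astype('float32')))
    # new style: save to a prefix, not a directory with "__model__"
    fluid.dygraph.jit.save(net, SAVE_PREFIX)

# static reload now needs enable_static() plus explicit file names
paddle.enable_static()
exe = fluid.Executor(place)
[program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    SAVE_DIR,
    executor=exe,
    model_filename=MODEL_FILENAME,
    params_filename=PARAMS_FILENAME)
```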
@@ -66,14 +66,13 @@ def test_save_inference_model(self): adam.minimize(loss) layer.clear_gradients() # test for saving model in dygraph.guard - infer_model_dir = "./test_dy2stat_save_inference_model_in_guard" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [pred] + infer_model_prefix = "./test_dy2stat_inference_in_guard/model" + infer_model_dir = "./test_dy2stat_inference_in_guard" fluid.dygraph.jit.save( layer=layer, - model_path=infer_model_dir, + path=infer_model_prefix, input_spec=[x], - configs=configs) + output_spec=[pred]) # Check the correctness of the inference dygraph_out, _ = layer(x) self.check_save_inference_model(layer, [x_data], dygraph_out.numpy()) @@ -91,30 +90,30 @@ def check_save_inference_model(self, expected_persistable_vars = set([p.name for p in model.parameters()]) - infer_model_dir = "./test_dy2stat_save_inference_model" - configs = fluid.dygraph.jit.SaveLoadConfig() - if fetch is not None: - configs.output_spec = fetch - configs.separate_params = True + infer_model_prefix = "./test_dy2stat_inference/model" + infer_model_dir = "./test_dy2stat_inference" + model_filename = "model" + INFER_MODEL_SUFFIX + params_filename = "model" + INFER_PARAMS_SUFFIX fluid.dygraph.jit.save( layer=model, - model_path=infer_model_dir, + path=infer_model_prefix, input_spec=feed if feed else None, - configs=configs) - saved_var_names = set([ - filename for filename in os.listdir(infer_model_dir) - if filename != '__model__' and filename != EXTRA_VAR_INFO_FILENAME - ]) - self.assertEqual(saved_var_names, expected_persistable_vars) + output_spec=fetch if fetch else None) # Check the correctness of the inference - infer_out = self.load_and_run_inference(infer_model_dir, inputs) + infer_out = self.load_and_run_inference(infer_model_dir, model_filename, + params_filename, inputs) self.assertTrue(np.allclose(gt_out, infer_out)) - def load_and_run_inference(self, model_path, inputs): + def load_and_run_inference(self, model_path, model_filename, + params_filename, inputs): + paddle.enable_static() exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - dirname=model_path, executor=exe) + dirname=model_path, + executor=exe, + model_filename=model_filename, + params_filename=params_filename) results = exe.run(inference_program, feed=dict(zip(feed_target_names, inputs)), fetch_list=fetch_targets) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index 8f11a58588463..15cff501838a1 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -24,7 +24,7 @@ from paddle.fluid.dygraph.nn import BatchNorm, Conv2D, Linear, Pool2D from paddle.fluid.dygraph import declarative from paddle.fluid.dygraph import ProgramTranslator -from paddle.fluid.dygraph.io import VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from predictor_utils import PredictorTools @@ -35,7 +35,10 @@ EPOCH_NUM = 1 PRINT_STEP = 2 STEP_NUM = 10 -MODEL_SAVE_PATH = "./se_resnet.inference.model" +MODEL_SAVE_DIR = "./inference" +MODEL_SAVE_PREFIX = "./inference/se_resnet" +MODEL_FILENAME = "se_resnet" + INFER_MODEL_SUFFIX +PARAMS_FILENAME = "se_resnet" + INFER_PARAMS_SUFFIX DY_STATE_DICT_SAVE_PATH = "./se_resnet.dygraph" place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ @@ -383,10 +386,10 @@ 
def train(train_reader, to_static): step_idx += 1 if step_idx == STEP_NUM: if to_static: - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [pred] - fluid.dygraph.jit.save(se_resnext, MODEL_SAVE_PATH, - [img], configs) + fluid.dygraph.jit.save( + se_resnext, + MODEL_SAVE_PREFIX, [img], + output_spec=[pred]) else: fluid.dygraph.save_dygraph(se_resnext.state_dict(), DY_STATE_DICT_SAVE_PATH) @@ -414,10 +417,14 @@ def predict_dygraph(data): def predict_static(data): + paddle.enable_static() exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_PATH, executor=exe, params_filename=VARIABLE_FILENAME) + MODEL_SAVE_DIR, + executor=exe, + model_filename=MODEL_FILENAME, + params_filename=PARAMS_FILENAME) pred_res = exe.run(inference_program, feed={feed_target_names[0]: data}, @@ -428,7 +435,7 @@ def predict_static(data): def predict_dygraph_jit(data): with fluid.dygraph.guard(place): - se_resnext = fluid.dygraph.jit.load(MODEL_SAVE_PATH) + se_resnext = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) se_resnext.eval() pred_res = se_resnext(data) @@ -437,7 +444,8 @@ def predict_dygraph_jit(data): def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_PATH, VARIABLE_FILENAME, [data]) + output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME, + [data]) out = output() return out diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py index 4fc8d27d30cb8..6721e7a51d2bc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py @@ -32,6 +32,7 @@ def train_static(args, batch_generator): + paddle.enable_static() paddle.manual_seed(SEED) paddle.framework.random._manual_program_seed(SEED) train_prog = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py index 8ebb99fda660e..e264a300d8c18 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_util.py @@ -277,7 +277,8 @@ def load_dygraph(model_path, keep_name_table=False): To load python2 saved models in python3. """ try: - para_dict, opti_dict = fluid.load_dygraph(model_path, keep_name_table) + para_dict, opti_dict = fluid.load_dygraph( + model_path, keep_name_table=keep_name_table) return para_dict, opti_dict except UnicodeDecodeError: warnings.warn( @@ -287,7 +288,7 @@ def load_dygraph(model_path, keep_name_table=False): if six.PY3: load_bak = pickle.load pickle.load = partial(load_bak, encoding="latin1") - para_dict, opti_dict = fluid.load_dygraph(model_path, - keep_name_table) + para_dict, opti_dict = fluid.load_dygraph( + model_path, keep_name_table=keep_name_table) pickle.load = load_bak return para_dict, opti_dict diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py new file mode 100755 index 0000000000000..e7cdd49a32c26 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -0,0 +1,122 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +from paddle import fluid +import os +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker + + +class TestFleetMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" + + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, + size=64, + act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + return avg_cost, strategy + + def optimizer(self, + loss, + strategy, + train_prog, + startup_prog, + name='momentum'): + with fluid.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + if name == 'momentum': + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) + elif name == 'adam': + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(loss) + + def set_strategy(self, strategy, name): + if name == 'amp': + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } + elif name == 'dgc': + strategy.dgc = True + strategy.dgc_configs = { + "rampup_begin_step": 128, + "rampup_step": 100, + "sparsity": [0.996, 0.999] + } + elif name == 'recompute': + strategy.recompute = True + strategy.recompute_configs = { + "checkpoints": ["fc_0.tmp_2", "fc_1.tmp_2"] + } + elif name == 'lars': + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.001, + "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ["batch_norm", ".b"], + } + elif name == 'lamb': + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } + elif name == 'localsgd': + strategy.localsgd = True + strategy.localsgd_configs = { + 'k_steps': 1, + 'begin_step': 1, + } + elif name == 'adaptive_localsgd': + strategy.adaptive_localsgd = True + strategy.adaptive_localsgd_configs = { + 'init_k_steps': 1, + 'begin_step': 1, + } + else: + raise NotImplementedError() diff --git 
a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py index ee917b059b87c..467bac67051dd 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py @@ -61,6 +61,37 @@ def init_data_type(self): create_test_mkldnn_class(TestCase5) +class TestAvgPoolAdaptive(TestPool2D_Op): + def init_adaptive(self): + self.adaptive = True + + def init_pool_type(self): + self.pool_type = "avg" + self.pool2D_forward_naive = avg_pool2D_forward_naive + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_test_case(self): + self.ksize = [1, 1] + self.strides = [1, 1] + + def init_data_type(self): + self.dtype = np.float32 + + def init_global_pool(self): + self.global_pool = False + + +class TestAvgPoolAdaptive2(TestAvgPoolAdaptive): + def init_test_case(self): + self.ksize = [2, 3] + self.strides = [1, 1] + + def init_shape(self): + self.shape = [2, 3, 6, 6] + + class TestAsymPad(TestPool2D_Op): def init_test_case(self): self.ksize = [3, 3] @@ -160,4 +191,6 @@ def init_shape(self): if __name__ == '__main__': + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 68a5fa5e8f367..4fed0c8552b44 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -25,10 +25,11 @@ import paddle.nn.functional as F from paddle.fluid import compiler, Program, program_guard +paddle.enable_static() + class TestSqrtOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program(), Program()): # The input type of sqrt op must be Variable or numpy.ndarray. in1 = 1 @@ -45,7 +46,6 @@ def test_errors(self): class TestActivation(OpTest): def setUp(self): - paddle.enable_static() self.op_type = "exp" self.init_dtype() self.init_kernel_type() @@ -74,7 +74,6 @@ def init_kernel_type(self): class TestParameter(object): def test_out_name(self): - paddle.enable_static() with fluid.program_guard(fluid.Program()): np_x = np.array([0.1]) data = fluid.layers.data(name="X", shape=[1]) @@ -96,7 +95,6 @@ def test_dygraph(self): class TestSigmoid(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "sigmoid" self.init_dtype() @@ -118,7 +116,6 @@ def test_check_grad(self): class TestLogSigmoid(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "logsigmoid" self.init_dtype() @@ -192,7 +189,6 @@ def test_errors(self): class TestTanh(TestActivation, TestParameter): def setUp(self): - paddle.enable_static() self.op_type = "tanh" self.init_dtype() np.random.seed(1024) @@ -273,7 +269,6 @@ def test_errors(self): class TestAtan(TestActivation, TestParameter): def setUp(self): - paddle.enable_static() self.op_type = "atan" self.init_dtype() @@ -311,7 +306,6 @@ def test_dygraph(self): class TestSinh(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "sinh" self.init_dtype() @@ -371,7 +365,6 @@ def test_backward(self): class TestSinhOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. 
self.assertRaises(TypeError, fluid.layers.sinh, 1) @@ -385,7 +378,6 @@ def test_errors(self): class TestCosh(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "cosh" self.init_dtype() @@ -445,7 +437,6 @@ def test_backward(self): class TestCoshOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.cosh, 1) @@ -464,7 +455,6 @@ def ref_tanhshrink(x): class TestTanhshrink(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "tanh_shrink" self.init_dtype() @@ -544,7 +534,6 @@ def ref_hardshrink(x, threshold): class TestHardShrink(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "hard_shrink" self.init_dtype() @@ -575,7 +564,6 @@ def set_attrs(self): class TestHardShrinkAPI(unittest.TestCase): # test paddle.nn.Hardshrink, paddle.nn.functional.hardshrink def setUp(self): - paddle.enable_static() np.random.seed(1024) self.x_np = np.random.uniform(-1, 1, [10, 12]).astype('float32') self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ @@ -704,7 +692,6 @@ def ref_softshrink(x, threshold=0.5): class TestSoftshrink(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "softshrink" self.init_dtype() @@ -784,7 +771,6 @@ def test_errors(self): class TestSqrt(TestActivation, TestParameter): def setUp(self): - paddle.enable_static() self.op_type = "sqrt" self.init_dtype() @@ -803,7 +789,6 @@ def test_check_grad(self): class TestRsqrt(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "rsqrt" self.init_dtype() @@ -822,7 +807,6 @@ def test_check_grad(self): class TestAbs(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "abs" self.init_dtype() @@ -846,7 +830,6 @@ def test_check_grad(self): class TestCeil(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "ceil" self.init_dtype() @@ -864,7 +847,6 @@ def test_check_grad(self): class TestFloor(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "floor" self.init_dtype() @@ -884,7 +866,6 @@ def test_check_grad(self): class TestCos(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "cos" self.init_dtype() @@ -903,7 +884,6 @@ def test_check_grad(self): class TestAcos(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "acos" self.init_dtype() @@ -922,7 +902,6 @@ def test_check_grad(self): class TestSin(TestActivation, TestParameter): def setUp(self): - paddle.enable_static() self.op_type = "sin" self.init_dtype() @@ -941,7 +920,6 @@ def test_check_grad(self): class TestAsin(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "asin" self.init_dtype() @@ -960,7 +938,6 @@ def test_check_grad(self): class TestRound(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "round" self.init_dtype() @@ -977,7 +954,6 @@ def test_check_grad(self): class TestRelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "relu" self.init_dtype() @@ -1052,7 +1028,6 @@ def get_alpha(self): return 0.02 def setUp(self): - paddle.enable_static() self.op_type = "leaky_relu" self.init_dtype() alpha = self.get_alpha() @@ -1162,7 +1137,6 @@ def gelu(x, approximate): class TestGeluApproximate(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "gelu" self.init_dtype() approximate = True @@ -1182,7 +1156,6 @@ def test_check_grad(self): class 
TestGelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "gelu" self.init_dtype() approximate = False @@ -1254,7 +1227,6 @@ def test_errors(self): class TestBRelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "brelu" self.init_dtype() @@ -1279,9 +1251,35 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestBReluOpError(unittest.TestCase): +class TestBreluAPI(unittest.TestCase): + # test paddle.fluid.layers.brelu + def setUp(self): + np.random.seed(1024) + self.t_min = 0. + self.t_max = 24. + self.x_np = np.random.uniform(-1, 30, [10, 12]).astype('float32') + self.out_ref = np.copy(self.x_np) + self.out_ref[self.out_ref < self.t_min] = self.t_min + self.out_ref[self.out_ref > self.t_max] = self.t_max + self.out_ref = self.out_ref.astype('float32') + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_fluid_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data('X', [10, 12]) + out = paddle.fluid.layers.brelu(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + self.assertTrue(np.allclose(self.out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.brelu(x) + self.assertTrue(np.allclose(self.out_ref, out.numpy())) + paddle.enable_static() + def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.brelu, 1) @@ -1303,7 +1301,6 @@ def ref_relu6(x, threshold=6.0): class TestRelu6(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "relu6" self.init_dtype() @@ -1378,9 +1375,13 @@ def test_errors(self): F.relu6(x_fp16) +def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0): + return (x * np.minimum(np.maximum(x + offset, 0.), threshold) / + scale).astype(x.dtype) + + class TestHardSwish(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = 'hard_swish' self.init_dtype() @@ -1392,9 +1393,9 @@ def setUp(self): #the same with TestAbs x[np.abs(x + offset) < 0.005] = 0.02 x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02 - out = x * np.minimum(np.maximum(x + offset, 0), threshold) / scale + out = ref_hardswish(x, threshold, scale, offset) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.inputs = {'X': x} self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset} self.outputs = {'Out': out} @@ -1404,23 +1405,65 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestHardSwishOpError(unittest.TestCase): - def test_errors(self): +class TestHardswishAPI(unittest.TestCase): + # test paddle.nn.Hardswish, paddle.nn.functional.hardswish + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.hardswish(x) + m = paddle.nn.Hardswish() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_hardswish(self.x_np) + for r in res: + self.assertTrue(np.allclose(out_ref, r)) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = 
F.hardswish(x) + m = paddle.nn.Hardswish() + out2 = m(x) + out_ref = ref_hardswish(self.x_np) + for r in [out1, out2]: + self.assertTrue(np.allclose(out_ref, r.numpy())) paddle.enable_static() - with program_guard(Program()): + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.hard_swish(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_hardswish(self.x_np) + self.assertTrue(np.allclose(out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.hard_swish(x) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.hard_swish, 1) + self.assertRaises(TypeError, F.hardswish, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.hard_swish, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.hardswish, x_int32) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.hard_swish(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.hardswish(x_fp16) class TestSoftRelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "soft_relu" self.init_dtype() @@ -1447,7 +1490,6 @@ def test_check_grad(self): class TestSoftReluOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. self.assertRaises(TypeError, fluid.layers.soft_relu, 1) @@ -1466,7 +1508,6 @@ def elu(x, alpha): class TestELU(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "elu" self.init_dtype() @@ -1540,7 +1581,6 @@ def test_errors(self): class TestReciprocal(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "reciprocal" self.init_dtype() @@ -1559,7 +1599,6 @@ def test_check_grad(self): class TestLog(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "log" self.init_dtype() @@ -1587,7 +1626,6 @@ def test_error(self): class TestLog1p(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "log1p" self.init_dtype() @@ -1633,7 +1671,6 @@ def test_api(self): class TestSquare(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "square" self.init_dtype() @@ -1652,7 +1689,6 @@ def test_check_grad(self): class TestPow(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "pow" self.init_dtype() @@ -1672,7 +1708,6 @@ def test_check_grad(self): class TestPow_factor_tensor(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "pow" self.init_dtype() @@ -1750,7 +1785,6 @@ def test_error(self): class TestSTanh(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "stanh" self.init_dtype() @@ -1772,7 +1806,6 @@ def test_check_grad(self): class TestSTanhOpError(unittest.TestCase): def test_errors(self): - paddle.enable_static() with program_guard(Program()): # The input type must be Variable. 
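Several of the activation tests above replace hand-rolled expected values with small numpy references (`ref_hardswish`, `ref_thresholded_relu`, `ref_hardsigmoid`, ...) that are checked against both the functional API and the Layer class. A hedged, self-contained sketch of that check for hardswish; the shapes are illustrative and the tolerance is the `np.allclose` default, as in the tests:

```python
import numpy as np
import paddle
import paddle.nn.functional as F


def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0):
    # numpy reference, same formula as in the test above
    return (x * np.minimum(np.maximum(x + offset, 0.), threshold) /
            scale).astype(x.dtype)


x_np = np.random.uniform(-5, 5, [4, 8]).astype('float32')

paddle.disable_static()
x = paddle.to_tensor(x_np)
out_functional = F.hardswish(x)          # functional form
out_layer = paddle.nn.Hardswish()(x)     # Layer form

ref = ref_hardswish(x_np)
assert np.allclose(ref, out_functional.numpy())
assert np.allclose(ref, out_layer.numpy())
paddle.enable_static()
```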
self.assertRaises(TypeError, fluid.layers.stanh, 1) @@ -1793,7 +1826,6 @@ def ref_softplus(x, beta=1, threshold=20): class TestSoftplus(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "softplus" self.init_dtype() @@ -1877,7 +1909,6 @@ def ref_softsign(x): class TestSoftsign(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "softsign" self.init_dtype() @@ -1948,23 +1979,24 @@ def test_errors(self): F.softsign(x_fp16) +def ref_thresholded_relu(x, threshold=1.0): + out = (x > threshold) * x + return out + + class TestThresholdedRelu(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "thresholded_relu" self.init_dtype() - threshold = 0.25 - self.delta = 0.005 - np.random.seed(1024) - X = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) - - # Same reason as TestAbs - X[np.abs(X - threshold) < self.delta] = threshold + 0.2 - out = (X > threshold) * X + threshold = 15 - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)} - self.attrs = {'threshold': threshold} + np.random.seed(1024) + x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_thresholded_relu(x, threshold) + self.inputs = {'X': x} + self.attrs = {"threshold": threshold} self.outputs = {'Out': out} def test_check_grad(self): @@ -1973,98 +2005,238 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestThresholdedReluOpError(unittest.TestCase): +class TestThresholdedReluAPI(unittest.TestCase): + # test paddle.nn.ThresholdedReLU, paddle.nn.functional.thresholded_relu + def setUp(self): + self.threshold = 15 + np.random.seed(1024) + self.x_np = np.random.uniform(-20, 20, [10, 12]).astype(np.float64) + self.x_np[np.abs(self.x_np) < 0.005] = 0.02 + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.thresholded_relu(x, self.threshold) + thresholded_relu = paddle.nn.ThresholdedReLU(self.threshold) + out2 = thresholded_relu(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_thresholded_relu(self.x_np, self.threshold) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.thresholded_relu(x, self.threshold) + thresholded_relu = paddle.nn.ThresholdedReLU(self.threshold) + out2 = thresholded_relu(x) + out_ref = ref_thresholded_relu(self.x_np, self.threshold) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_fluid_api(self): + paddle.enable_static() + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.thresholded_relu(x, self.threshold) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_thresholded_relu(self.x_np, self.threshold) + self.assertEqual(np.allclose(out_ref, res[0]), True) + def test_errors(self): paddle.enable_static() - with program_guard(Program()): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. 
- self.assertRaises(TypeError, fluid.layers.thresholded_relu, 1) + self.assertRaises(TypeError, F.thresholded_relu, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.thresholded_relu, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.thresholded_relu, x_int32) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.thresholded_relu(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.thresholded_relu(x_fp16) + + +def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5): + return np.maximum(np.minimum(x * slope + offset, 1.), 0.).astype(x.dtype) class TestHardSigmoid(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "hard_sigmoid" - self.init_dtype() - - np.random.seed(1024) - X = np.random.uniform(-5, 5, [10, 12]).astype("float32") - slope = 0.2 - offset = 0.5 - lower_threshold = -offset / slope - upper_threshold = (1 - offset) / slope + self.dtype = 'float64' + self.slope = 0.166666666666667 + self.offset = 0.5 + self.set_attrs() - self.delta = 0.005 + x = np.random.uniform(-5, 5, [10, 12]).astype(self.dtype) + lower_threshold = -self.offset / self.slope + upper_threshold = (1. - self.offset) / self.slope # Same reason as TestAbs - X[(X - lower_threshold) < self.delta] = lower_threshold - 0.02 - X[(X - upper_threshold) < self.delta] = upper_threshold + 0.02 + delta = 0.005 + x[np.abs(x - lower_threshold) < delta] = lower_threshold - 0.02 + x[np.abs(x - upper_threshold) < delta] = upper_threshold - 0.02 - temp = X * slope + offset - out = np.maximum(0.0, np.minimum(1.0, temp)) + out = ref_hardsigmoid(x, self.slope, self.offset) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)} + self.attrs = {'slope': self.slope, 'offset': self.offset} + self.inputs = {'X': x} self.outputs = {'Out': out} - def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad(['X'], 'Out') + def set_attrs(self): + pass -class TestHardSigmoidOpError(unittest.TestCase): - def test_errors(self): +class TestHardSigmoidFP32(TestHardSigmoid): + def set_attrs(self): + self.dtype = 'float32' + + +class TestHardSigmoidSlopeOffset(TestHardSigmoid): + def set_attrs(self): + self.slope = 0.2 + self.offset = 0.4 + + +class TestHardsigmoidAPI(unittest.TestCase): + # test paddle.nn.Hardsigmoid, paddle.nn.functional.hardsigmoid + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.hardsigmoid(x) + m = paddle.nn.Hardsigmoid() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_hardsigmoid(self.x_np) + for r in res: + self.assertTrue(np.allclose(out_ref, r)) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.hardsigmoid(x) + m = paddle.nn.Hardsigmoid() + out2 = m(x) + out_ref = ref_hardsigmoid(self.x_np) + for r in [out1, out2]: + self.assertTrue(np.allclose(out_ref, r.numpy())) paddle.enable_static() - with program_guard(Program()): + + def test_fluid_api(self): + with 
fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.hard_sigmoid(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_hardsigmoid(self.x_np, 0.2, 0.5) + self.assertTrue(np.allclose(out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.hard_sigmoid(x) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.hard_sigmoid, 1) + self.assertRaises(TypeError, F.hardsigmoid, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.hard_sigmoid, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.hardsigmoid, x_int32) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.hard_sigmoid(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.hardsigmoid(x_fp16) + + +def ref_swish(x): + out = x * expit(x) + return out class TestSwish(TestActivation): def setUp(self): - paddle.enable_static() self.op_type = "swish" self.init_dtype() np.random.seed(1024) - X = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) - beta = 2.3 - out = X * expit(beta * X) - - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)} - self.attrs = {'beta': beta} + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_swish(x) + self.inputs = {'X': x} + self.attrs = {'beta': 1.0} self.outputs = {'Out': out} def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out', max_relative_error=0.008) + self.check_grad(['X'], 'Out') + + +class TestSwishAPI(unittest.TestCase): + # test paddle.nn.Swish, paddle.nn.functional.swish + def setUp(self): + np.random.seed(1024) + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float64) + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.swish(x) + swish = paddle.nn.Swish() + out2 = swish(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_swish(self.x_np) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.swish(x) + swish = paddle.nn.Swish() + out2 = swish(x) + out_ref = ref_swish(self.x_np) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + def test_fluid_api(self): + paddle.enable_static() + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.swish(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = ref_swish(self.x_np) + self.assertEqual(np.allclose(out_ref, res[0]), True) -class TestSwishOpError(unittest.TestCase): def test_errors(self): paddle.enable_static() - with program_guard(Program()): + with 
paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.swish, 1) + self.assertRaises(TypeError, F.swish, 1) # The input dtype must be float16, float32, float64. - x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.swish, x_int32) + x_int32 = paddle.data(name='x_int32', shape=[12, 10], dtype='int32') + self.assertRaises(TypeError, F.swish, x_int32) # support the input dtype is float16 - x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') - fluid.layers.swish(x_fp16) + x_fp16 = paddle.data(name='x_fp16', shape=[12, 10], dtype='float16') + F.swish(x_fp16) #------------------ Test Error Activation---------------------- diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py index 9dd617f90b65d..7bdfa3d2dfd74 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py @@ -34,6 +34,7 @@ class InplaceTestBase(unittest.TestCase): def initParameter(self): self.use_cuda = True self.fuse_all_optimizer_ops = False + self.fuse_all_reduce_ops = False def setUp(self): paddle.enable_static() @@ -93,6 +94,7 @@ def check_single_card_fetch_var(self): build_strategy.memory_optimize = memory_optimize build_strategy.enable_inplace = enable_inplace build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops + build_strategy.fuse_all_reduce_ops = self.fuse_all_reduce_ops compiled_prog = fluid.CompiledProgram(prog).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, @@ -146,6 +148,7 @@ def check_multi_card_fetch_var(self): build_strategy.memory_optimize = memory_optimize build_strategy.enable_inplace = enable_inplace build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops + build_strategy.fuse_all_reduce_ops = self.fuse_all_reduce_ops compiled_program = fluid.CompiledProgram( prog).with_data_parallel( loss_name=loss.name, @@ -175,6 +178,7 @@ class CUDAInplaceTest(InplaceTestBase): def initParameter(self): self.use_cuda = True self.fuse_all_optimizer_ops = False + self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() @@ -187,6 +191,7 @@ class CPUInplaceTest(InplaceTestBase): def initParameter(self): self.use_cuda = False self.fuse_all_optimizer_ops = False + self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py index 0b14cab4a7846..e9e62bee00680 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py @@ -20,6 +20,7 @@ class CUDAInplaceTestWithFuseOptimizationOps(InplaceTestBase): def initParameter(self): self.use_cuda = True self.fuse_all_optimizer_ops = True + self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() @@ -32,6 +33,7 @@ class CPUInplaceTestWithFuseOptimizationOps(InplaceTestBase): def initParameter(self): self.use_cuda = False 
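The inplace-pass tests above now exercise `fuse_all_reduce_ops` alongside `fuse_all_optimizer_ops` on the build strategy. For context, a minimal sketch of how such a `BuildStrategy` is assembled and attached to a compiled program; the toy network and flag values are placeholders, not taken from this patch:

```python
import paddle
import paddle.fluid as fluid

paddle.enable_static()

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[None, 32], dtype='float32')
    y = fluid.data(name='y', shape=[None, 1], dtype='int64')
    pred = fluid.layers.fc(input=x, size=2, act='softmax')
    loss = fluid.layers.mean(
        fluid.layers.cross_entropy(input=pred, label=y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

build_strategy = fluid.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = True
build_strategy.fuse_all_optimizer_ops = True
build_strategy.fuse_all_reduce_ops = False   # the new knob the tests cover

compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy)
```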
self.fuse_all_optimizer_ops = True + self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() diff --git a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py index 49b93e0dfaaac..d615f7cb7044e 100644 --- a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py @@ -16,12 +16,14 @@ import unittest +import paddle import paddle.fluid.framework as framework import paddle.fluid.optimizer as optimizer import paddle.fluid.regularizer as regularizer import paddle.fluid.clip as clip import paddle.compat as cpt from paddle.fluid.backward import append_backward +paddle.enable_static() class TestDGCMomentumOptimizer(unittest.TestCase): @@ -86,13 +88,17 @@ def check_dgc_momentum_optimizer(self, block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out}) # params_grads = append_backward(mean_out) - params_grads = dgc_momentum_optimizer.backward(mean_out) + params_grads = dgc_momentum_optimizer.backward( + mean_out, startup_program=init_program) + + with framework.program_guard(program, init_program): + opts = dgc_momentum_optimizer.apply_gradients(params_grads) + accumulator_count = 1 if name == "momentum" else 2 self.assertEqual(len(params_grads), 1) self.assertEqual( len(dgc_momentum_optimizer.get_accumulators()), accumulator_count) - with framework.program_guard(program, init_program): - opts = dgc_momentum_optimizer.apply_gradients(params_grads) + self.assertEqual(len(opts), 2) sgd_op = opts[-1] self.assertEqual([op.type for op in opts], ["scale", name]) @@ -108,8 +114,11 @@ def check_dgc_momentum_optimizer(self, self.assertTrue(mul_x.name in velocity_acc) # Check init_program + # dgc not apply include: lr, dgc(count, nranks, begin step), (u,) + # dgc apply include: lr, dgc(count, nranks, begin_step), (u,v,k,encode,gather) + init_ops_count = 5 if name == "momentum" else 9 init_ops = init_program.global_block().ops - self.assertEqual(len(init_ops), 1) + self.assertEqual(len(init_ops), init_ops_count) self.assertEqual(init_ops[0].type, "fill_constant") self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 2f35b45aa670c..fd014f3b4ecaf 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -43,15 +43,14 @@ def test_new_directory(self): 'paddle.distributed.prepare_context', 'paddle.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', - 'paddle.jit.save', 'paddle.jit.load', 'paddle.SaveLoadConfig', - 'paddle.NoamDecay', 'paddle.PiecewiseDecay', - 'paddle.NaturalExpDecay', 'paddle.ExponentialDecay', - 'paddle.InverseTimeDecay', 'paddle.PolynomialDecay', - 'paddle.CosineDecay', 'paddle.static.Executor', - 'paddle.static.global_scope', 'paddle.static.scope_guard', - 'paddle.static.append_backward', 'paddle.static.gradients', - 'paddle.static.BuildStrategy', 'paddle.static.CompiledProgram', - 'paddle.static.ExecutionStrategy', + 'paddle.jit.save', 'paddle.jit.load', 'paddle.NoamDecay', + 'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay', + 'paddle.ExponentialDecay', 'paddle.InverseTimeDecay', + 'paddle.PolynomialDecay', 'paddle.CosineDecay', + 'paddle.static.Executor', 
'paddle.static.global_scope', + 'paddle.static.scope_guard', 'paddle.static.append_backward', + 'paddle.static.gradients', 'paddle.static.BuildStrategy', + 'paddle.static.CompiledProgram', 'paddle.static.ExecutionStrategy', 'paddle.static.default_main_program', 'paddle.static.default_startup_program', 'paddle.static.Program', 'paddle.static.name_scope', 'paddle.static.program_guard', @@ -65,11 +64,10 @@ def test_new_directory(self): 'paddle.static.nn.create_parameter', 'paddle.static.nn.crf_decoding', 'paddle.static.nn.data_norm', 'paddle.static.nn.deformable_conv', 'paddle.static.nn.group_norm', - 'paddle.static.nn.hsigmoid', 'paddle.static.nn.instance_norm', - 'paddle.static.nn.layer_norm', 'paddle.static.nn.multi_box_head', - 'paddle.static.nn.nce', 'paddle.static.nn.prelu', - 'paddle.static.nn.row_conv', 'paddle.static.nn.spectral_norm', - 'paddle.static.nn.embedding' + 'paddle.static.nn.instance_norm', 'paddle.static.nn.layer_norm', + 'paddle.static.nn.multi_box_head', 'paddle.static.nn.nce', + 'paddle.static.nn.prelu', 'paddle.static.nn.row_conv', + 'paddle.static.nn.spectral_norm', 'paddle.static.nn.embedding' ] import_file = 'run_import_modules.py' @@ -104,9 +102,7 @@ def test_old_directory(self): 'paddle.imperative.TracedLayer', 'paddle.imperative.declarative', 'paddle.imperative.ProgramTranslator', 'paddle.imperative.TranslatedLayer', 'paddle.imperative.jit.save', - 'paddle.imperative.jit.load', - 'paddle.imperative.jit.SaveLoadConfig', - 'paddle.imperative.NoamDecay' + 'paddle.imperative.jit.load', 'paddle.imperative.NoamDecay' 'paddle.imperative.PiecewiseDecay', 'paddle.imperative.NaturalExpDecay', 'paddle.imperative.ExponentialDecay', diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index ec30cb70c5790..1272d82dfdd1d 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -149,9 +149,9 @@ def run_program(num_flatten_dims): append_batch_size=False, dtype="float32") - out = fluid.layers.fc(input=x, - size=1, - num_flatten_dims=num_flatten_dims) + out = paddle.static.nn.fc(x=x, + size=1, + num_flatten_dims=num_flatten_dims) place = fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 362428631e68c..6bc1a310d0aea 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -12,57 +12,97 @@ # See the License for the specific language governing permissions and # limitations under the License. 
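The AMP and DGC meta-optimizer tests below are rebuilt on the shared `TestFleetMetaOptimizer` base class added earlier in this patch (its `net`, `set_strategy`, and `optimizer` helpers). A sketch of how a subclass typically drives those helpers; the class name here is hypothetical, while the `'subprog'` assertion follows the same recompute check used by the rewritten tests:

```python
import unittest

import paddle
import paddle.fluid as fluid
from fleet_meta_optimizer_base import TestFleetMetaOptimizer

paddle.enable_static()


class TestFleetRecomputePattern(TestFleetMetaOptimizer):
    def test_recompute_optimizer(self):
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        # build the shared toy network and a fresh DistributedStrategy
        avg_cost, strategy = self.net(train_prog, startup_prog)

        # pick any key handled by set_strategy(), e.g. 'recompute'
        self.set_strategy(strategy, 'recompute')
        self.optimizer(avg_cost, strategy, train_prog, startup_prog)

        # recompute rewrites the forward pass into checkpointed subprograms
        outs = [
            op.output('Out')[0] for op in avg_cost.block.ops
            if op.type == 'mul'
        ]
        self.assertIn('subprog', ''.join(outs))


if __name__ == "__main__":
    unittest.main()
```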
-import paddle.distributed.fleet as fleet -import paddle.distributed.fleet.base.role_maker as role_maker import unittest import paddle +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle.distributed.fleet.meta_optimizers import AMPOptimizer import os +from fleet_meta_optimizer_base import TestFleetMetaOptimizer paddle.enable_static() -class TestFleetAMPOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "0" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" +class TestFleetAMPOptimizer(TestFleetMetaOptimizer): + def test_amp_optimizer_backward(self): + """ test amp optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = AMPOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertNotIn('check_finite_and_unscale', ops) + + def test_amp_optimizer_backward_gradients(self): + """ test amp optimizer backward + gradients""" + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = AMPOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + with fluid.program_guard(train_prog, startup_prog): + opt.apply_gradients(params_grads) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + def test_amp_optimizer_backward_optimize(self): + """ test amp optimizer backward + optimizer """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = AMPOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + opt.apply_optimize(avg_cost, startup_prog, params_grads) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) def test_amp_optimizer(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.amp = True - strategy.amp_configs = { - "init_loss_scaling": 32768, - "decr_every_n_nan_or_inf": 2, - "incr_every_n_steps": 1000, - "incr_ratio": 2.0, - "use_dynamic_loss_scaling": True, - "decr_ratio": 0.5, - "custom_white_list": ['softmax'], - "custom_black_list": ['tanh'], - } - - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + """ test amp """ + train_prog, startup_prog = fluid.Program(), 
fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'amp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + def test_amp_recompute_optimizer(self): + """ test amp + recompute """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'amp') + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) strategy = fleet._final_strategy() ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] self.assertIn('cast', ops) self.assertIn('check_finite_and_unscale', ops) + # recompute + self.assertIn('subprog', ''.join(outs)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py index 55d4ff7726aac..0faafd76a799d 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_dgc_meta_optimizer.py @@ -17,65 +17,82 @@ from paddle import fluid import os import paddle.distributed.fleet as fleet +from fleet_meta_optimizer_base import TestFleetMetaOptimizer +from paddle.distributed.fleet.meta_optimizers import DGCOptimizer import paddle.distributed.fleet.base.role_maker as role_maker +paddle.enable_static() -class TestFleetDGCOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "1" - os.environ[ - "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" - - def net(self, main_prog, startup_prog): - with fluid.program_guard(main_prog, startup_prog): - with fluid.unique_name.guard(): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data( - name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, - size=64, - act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=256, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], - size=2, - act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.dgc = True - strategy.dgc_configs = { - "rampup_begin_step": 128, - "rampup_step": 100, - "sparsity": [0.996, 0.999] - } - return avg_cost, strategy + +class TestFleetDGCOptimizer(TestFleetMetaOptimizer): + def test_dgc_optimizer_backward(self): + """ test dgc optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'dgc') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + dgc_opt = DGCOptimizer(opt) + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + dgc_opt._set_basic_info(avg_cost, role, opt, strategy) + params_grads = dgc_opt.backward(avg_cost, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + self.assertNotIn('dgc', ops) + + def test_dgc_optimizer_gradients(self): + """ test dgc optimizer backward + gradients """ + train_prog, startup_prog = 
fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'dgc') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + dgc_opt = DGCOptimizer(opt) + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + dgc_opt._set_basic_info(avg_cost, role, opt, strategy) + params_grads = dgc_opt.backward(avg_cost, startup_prog) + with fluid.program_guard(train_prog, startup_prog): + dgc_opt.apply_gradients(params_grads) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('dgc', ops) + self.assertIn('dgc_momentum', ops) + + def test_dgc_optimizer_optimize(self): + """ test dgc optimizer backward + optimize """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'dgc') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + dgc_opt = DGCOptimizer(opt) + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + dgc_opt._set_basic_info(avg_cost, role, opt, strategy) + params_grads = dgc_opt.backward(avg_cost, startup_prog) + dgc_opt.apply_optimize(avg_cost, startup_prog, params_grads) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('dgc', ops) + self.assertIn('dgc_momentum', ops) def test_dgc_optimizer(self): - startup_prog = fluid.Program() - train_prog = fluid.Program() + train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + self.set_strategy(strategy, 'dgc') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) ops = [op.type for op in avg_cost.block.ops] self.assertIn('dgc', ops) self.assertIn('dgc_momentum', ops) def test_dgc_not_apply_with_adam(self): - startup_prog = fluid.Program() - train_prog = fluid.Program() + train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + self.set_strategy(strategy, 'dgc') + self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam') ops = [op.type for op in avg_cost.block.ops] self.assertNotIn('dgc', ops) @@ -85,18 +102,32 @@ def test_dgc_not_apply_with_one_worker(self): os.environ["PADDLE_TRAINER_ID"] = "0" os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - startup_prog = fluid.Program() - train_prog = fluid.Program() + train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + self.set_strategy(strategy, 'dgc') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) ops = [op.type for op in avg_cost.block.ops] self.assertNotIn('dgc', ops) self.assertNotIn('dgc_momentum', ops) + def test_dgc_recompute_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'dgc') + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, 
startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('dgc', ops) + self.assertIn('dgc_momentum', ops) + + # recompute + self.assertIn('subprog', ''.join(outs)) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index f5347b0c665e2..bafb2419123b0 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -16,71 +16,87 @@ import paddle import os +import paddle +import paddle.fluid as fluid import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker +from fleet_meta_optimizer_base import TestFleetMetaOptimizer +paddle.enable_static() -class TestFleetLocalSGDMetaOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "1" - os.environ[ - "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" +class TestFleetLocalSGDMetaOptimizer(TestFleetMetaOptimizer): def test_localsgd_optimizer(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.localsgd = True - strategy.auto = True - config = strategy.localsgd_configs - config['k_steps'] = 1 - config['begin_step'] = 1 - strategy.localsgd_configs = config - - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - - -class TestFleetAdaptiveLocalSGDMetaOptimizer(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ID"] = "1" - os.environ[ - "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" - + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'localsgd') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + ''.join(op.output('Out')) for op in avg_cost.block.ops + if op.type == 'conditional_block' + ] + + self.assertIn('conditional_block', ops) + self.assertIn('@SNAPSHOT', ''.join(outs)) + + def test_localsgd_amp_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'localsgd') + self.set_strategy(strategy, 'amp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + ''.join(op.output('Out')) for op in avg_cost.block.ops + if op.type == 'conditional_block' + ] + + self.assertIn('conditional_block', ops) + self.assertIn('@SNAPSHOT', ''.join(outs)) + + # amp + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + +class TestFleetAdaptiveLocalSGDMetaOptimizer(TestFleetMetaOptimizer): def 
test_adaptive_localsgd_optimizer(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.adaptive_localsgd = True - config = strategy.adaptive_localsgd_configs - config['init_k_steps'] = 1 - config['begin_step'] = 1 - strategy.adaptive_localsgd_configs = config - - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'adaptive_localsgd') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + ''.join(op.output('Out')) for op in avg_cost.block.ops + if op.type == 'conditional_block' + ] + + self.assertIn('conditional_block', ops) + self.assertIn('@SNAPSHOT', ''.join(outs)) + + def test_localsgd_amp_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'adaptive_localsgd') + self.set_strategy(strategy, 'amp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + ''.join(op.output('Out')) for op in avg_cost.block.ops + if op.type == 'conditional_block' + ] + + self.assertIn('conditional_block', ops) + self.assertIn('@SNAPSHOT', ''.join(outs)) + + # amp + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py index a42010a4eaa50..42b60cd3fad5a 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_recompute_meta_optimizer.py @@ -14,40 +14,144 @@ import unittest import paddle +import paddle.fluid as fluid import os +from fleet_meta_optimizer_base import TestFleetMetaOptimizer +from paddle.distributed.fleet.meta_optimizers import RecomputeOptimizer +paddle.enable_static() -class TestFleetRecomputeMetaOptimizer(unittest.TestCase): - def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + +class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer): + def test_recompute_optimizer_backward(self): + """ test recompute optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + 
+ outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_optimizer_backward_gradients(self): + """ test recompute optimizer backward + gradients """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + with fluid.program_guard(train_prog, startup_prog): + opt.apply_gradients(params_grads) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_optimizer_backward_optimize(self): + """ test recompute optimizer backward + optimize """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + opt.apply_optimize(avg_cost, startup_prog, params_grads) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_optimizer_backward(self): + """ test recompute optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_optimizer_backward(self): + """ test recompute optimizer backward """ + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + + self.set_strategy(strategy, 'recompute') + opt = fluid.optimizer.MomentumOptimizer( + learning_rate=0.001, momentum=0.9) + opt = RecomputeOptimizer(opt) + opt.user_defined_strategy = strategy + params_grads = opt.backward(avg_cost, startup_prog) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + self.assertIn('subprog', ''.join(outs)) def test_recompute_optimizer(self): - import paddle.distributed.fleet as fleet - import paddle.distributed.fleet.base.role_maker as role_maker - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - input_x = paddle.fluid.layers.data( - name="x", shape=[32], dtype='float32') - input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') - - fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') - fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') - prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') - cost = paddle.fluid.layers.cross_entropy( - input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.recompute = True - strategy.recompute_configs = {"checkpoints": ["fc_1.tmp_0"]} 
- - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + + self.assertIn('subprog', ''.join(outs)) + + def test_recompute_lars_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'lars') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + + self.assertIn('subprog', ''.join(outs)) + self.assertIn('lars_momentum', ops) + + def test_recompute_lamb_optimizer(self): + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'lamb') + self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam') + + ops = [op.type for op in avg_cost.block.ops] + outs = [ + op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul' + ] + + self.assertIn('subprog', ''.join(outs)) + self.assertIn('lamb', ops) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid.py b/python/paddle/fluid/tests/unittests/test_hsigmoid.py deleted file mode 100644 index 80937640c2d2f..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from paddle import fluid, nn -import paddle.fluid.dygraph as dg -import paddle.nn.functional as F -import paddle.fluid.initializer as I -import numpy as np -import unittest - - -class HSigmoidTestCase(unittest.TestCase): - def __init__(self, - methodName="runTest", - batch_size=4, - feature_size=6, - num_classes=8, - labels=None, - path_code=None, - path_table=None, - is_sparse=False, - dtype="float32"): - super(HSigmoidTestCase, self).__init__() - self.batch_size = batch_size - self.feature_size = feature_size - self.num_classes = num_classes - self.dtype = dtype - self.is_sparse = is_sparse - - self.labels = labels - self.path_code = path_code - self.path_table = path_table - self.is_custom = path_code is not None and path_table is not None - - def setUp(self): - input_shape = (self.batch_size, self.feature_size) - self.input = np.random.uniform( - -1, 1, size=input_shape).astype(self.dtype) - if self.labels is None: - self.labels = np.random.randint( - 0, self.num_classes, size=(self.batch_size, 1)).astype(np.int64) - C = self.num_classes if self.is_custom else self.num_classes - 1 - self.weight_shape = (C, self.feature_size) - self.weight = np.random.randn(*self.weight_shape).astype(self.dtype) - self.bias_shape = (C, 1) - self.bias = np.random.randn(*self.bias_shape).astype(self.dtype) - - def fluid_layer(self, place): - main = fluid.Program() - start = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_guard(main, start): - x = fluid.data( - "input", [-1, self.feature_size], dtype=self.dtype) - label = fluid.data("labels", [-1, 1], dtype="int64") - if self.is_custom: - path_table = fluid.data( - "path_table", [-1, -1], dtype="int64") - path_code = fluid.data("path_code", [-1, -1], dtype="int64") - else: - path_table = path_code = None - y = fluid.layers.hsigmoid( - x, - label, - self.num_classes, - param_attr=I.NumpyArrayInitializer(self.weight), - bias_attr=I.NumpyArrayInitializer(self.bias), - path_table=path_table, - path_code=path_code, - is_custom=self.is_custom, - is_sparse=self.is_sparse, ) - exe = fluid.Executor(place) - exe.run(start) - feed_dict = {"input": self.input, "labels": self.labels} - if self.is_custom: - feed_dict["path_code"] = self.path_code - feed_dict["path_table"] = self.path_table - y_np, = exe.run(main, feed=feed_dict, fetch_list=[y]) - return y_np - - def functional(self, place): - main = fluid.Program() - start = fluid.Program() - with fluid.unique_name.guard(): - with fluid.program_guard(main, start): - x = fluid.data( - "input", [-1, self.feature_size], dtype=self.dtype) - label = fluid.data("labels", [-1, 1], dtype="int64") - if self.is_custom: - path_table = fluid.data( - "path_table", [-1, -1], dtype="int64") - path_code = fluid.data("path_code", [-1, -1], dtype="int64") - else: - path_table = path_code = None - w = fluid.data("weight", self.weight_shape, dtype=self.dtype) - b = fluid.data("bias", self.bias_shape, dtype=self.dtype) - y = F.hsigmoid( - x, - label, - w, - b, - self.num_classes, - is_sparse=self.is_sparse, - path_table=path_table, - path_code=path_code) - - exe = fluid.Executor(place) - exe.run(start) - feed_dict = { - "input": self.input, - "labels": self.labels, - "weight": self.weight, - "bias": self.bias - } - if self.is_custom: - feed_dict["path_code"] = self.path_code - feed_dict["path_table"] = self.path_table - y_np, = exe.run(main, feed=feed_dict, fetch_list=[y]) - return y_np - - def nn_layer(self, place): - with dg.guard(place): - x_var = dg.to_variable(self.input) - label_var = 
dg.to_variable(self.labels) - if self.is_custom: - path_code_var = dg.to_variable(self.path_code) - path_table_var = dg.to_variable(self.path_table) - else: - path_code_var = path_table_var = None - hierarchical_softmax = nn.HSigmoid( - self.feature_size, - self.num_classes, - is_custom=self.is_custom, - is_sparse=self.is_sparse, - param_attr=I.NumpyArrayInitializer(self.weight), - bias_attr=I.NumpyArrayInitializer(self.bias), - dtype=self.dtype) - y_var = hierarchical_softmax( - x_var, - label_var, - path_table=path_table_var, - path_code=path_code_var) - y_np = y_var.numpy() - return y_np - - def _test_equivalence(self, place): - result1 = self.fluid_layer(place) - result2 = self.functional(place) - result3 = self.nn_layer(place) - np.testing.assert_array_almost_equal(result1, result2) - np.testing.assert_array_almost_equal(result2, result3) - - def runTest(self): - place = fluid.CPUPlace() - self._test_equivalence(place) - - -class HSigmoidTestErrorCase(HSigmoidTestCase): - def runTest(self): - place = fluid.CPUPlace() - with dg.guard(place): - with self.assertRaises(ValueError): - self.nn_layer() - - def nn_layer(self): - x_var = dg.to_variable(self.input) - label_var = dg.to_variable(self.labels) - if self.is_custom: - path_code_var = dg.to_variable(self.path_code) - path_table_var = dg.to_variable(self.path_table) - else: - path_code_var = path_table_var = None - hierarchical_softmax = nn.HSigmoid( - self.feature_size, - self.num_classes, - is_custom=self.is_custom, - param_attr=I.NumpyArrayInitializer(self.weight), - bias_attr=I.NumpyArrayInitializer(self.bias), - dtype=self.dtype) - y_var = hierarchical_softmax( - x_var, - label_var, - path_table=path_table_var, - path_code=path_code_var) - y_np = y_var.numpy() - return y_np - - -def load_tests(loader, standard_tests, pattern): - suite = unittest.TestSuite() - suite.addTest(HSigmoidTestCase(methodName="runTest")) - suite.addTest( - HSigmoidTestCase( - methodName="runTest", - batch_size=4, - feature_size=6, - num_classes=8, - labels=np.array([0, 1, 4, 5]).astype(np.int64), - path_table=np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), ( - 0, 1, 4, -1, -1), (0, 2, -1, -1, -1)]).astype(np.int64), - path_code=np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( - 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype(np.int64))) - suite.addTest(HSigmoidTestErrorCase(methodName="runTest", num_classes=1)) - return suite - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 5c9867e681524..3f8eed08adf68 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -19,10 +19,13 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid +import paddle.nn.functional as F from paddle.fluid import Program, program_guard +import paddle.fluid.initializer as I import math from op_test import OpTest, skip_check_grad_ci +paddle.enable_static() np.random.seed(100) @@ -56,7 +59,6 @@ def cal_index(self, bit): def get_length(self): length = 0 for ele in self.ptable_[self.index_]: # find the first -1 to stop trace - if ele >= 0: length = length + 1 else: @@ -388,8 +390,192 @@ def test_check_grad(self): self.check_grad(['X', 'W'], ['Out'], no_grad_set=set('Label')) -class TestHSigmoidOpError(unittest.TestCase): +class TestHSigmoidLossAPI(unittest.TestCase): + # test paddle.nn.functional.hsigmoid_loss, paddle.nn.HSigmoidLoss + def setUp(self): + 
self.dtype = 'float32' + self.batch_size = 4 + self.feature_size = 6 + self.num_classes = 8 + self.is_custom = False + self.place = paddle.CPUPlace() + + paddle.set_default_dtype(self.dtype) + + self.x_np = np.random.uniform( + -1, 1, [self.batch_size, self.feature_size]).astype(self.dtype) + self.labels_np = np.random.randint( + self.num_classes, size=(self.batch_size, 1), dtype='int64') + self.weight_np = np.random.uniform( + -1, 1, [self.num_classes - 1, self.feature_size]).astype(self.dtype) + self.bias_np = np.random.uniform(-1, 1, ( + self.num_classes - 1, )).astype(self.dtype) + self.path_table_np = None + self.path_code_np = None + _, self.out_np = hsigmoid(self.x_np, self.weight_np, self.labels_np, + self.bias_np, self.num_classes) + self.set_attrs() + + if self.is_custom: + _, self.out_np = hsigmoidWithCustomTree( + self.x_np, self.weight_np, self.path_table_np, + self.path_code_np, self.labels_np, + self.bias_np.reshape(-1, 1), self.num_classes) + + def set_attrs(self): + pass + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + labels = paddle.to_tensor(self.labels_np) + weight = paddle.to_tensor(self.weight_np) + bias = paddle.to_tensor(self.bias_np) + path_table = None + path_code = None + if self.is_custom: + path_table = paddle.to_tensor(self.path_table_np) + path_code = paddle.to_tensor(self.path_code_np) + out1 = F.hsigmoid_loss(x, labels, self.num_classes, weight, bias, + path_table, path_code) + + weight_attr = I.NumpyArrayInitializer(self.weight_np) + bias_attr = I.NumpyArrayInitializer(self.bias_np) + m = paddle.nn.HSigmoidLoss(self.feature_size, self.num_classes, + weight_attr, bias_attr, self.is_custom) + out2 = m(x, labels, path_table, path_code) + + for out in [out1, out2]: + self.assertTrue(np.allclose(self.out_np, out.numpy())) + paddle.enable_static() + + def test_static_api(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(train_program, startup_program): + x = paddle.static.data('x', [-1, self.feature_size]) + labels = paddle.static.data('labels', [-1, 1], 'int64') + weight = paddle.static.data('weight', [-1, self.feature_size]) + bias = paddle.static.data('bias', [-1, ]) + path_table = None + path_code = None + if self.is_custom: + path_table = paddle.static.data('path_table', [-1, -1], 'int64') + path_code = paddle.static.data('path_code', [-1, -1], 'int64') + out1 = F.hsigmoid_loss(x, labels, self.num_classes, weight, bias, + path_table, path_code) + + weight_attr = paddle.framework.ParamAttr( + initializer=I.NumpyArrayInitializer(self.weight_np)) + bias_attr = paddle.framework.ParamAttr( + initializer=I.NumpyArrayInitializer(self.bias_np)) + m = paddle.nn.HSigmoidLoss(self.feature_size, self.num_classes, + weight_attr, bias_attr, self.is_custom) + out2 = m(x, labels, path_table, path_code) + + exe = paddle.static.Executor(self.place) + exe.run(startup_program) + feed_dict = { + 'x': self.x_np, + 'labels': self.labels_np, + 'weight': self.weight_np, + 'bias': self.bias_np + } + if self.is_custom: + feed_dict["path_code"] = self.path_code_np + feed_dict["path_table"] = self.path_table_np + ret1, ret2 = exe.run(train_program, + feed=feed_dict, + fetch_list=[out1, out2]) + + for ret in [ret1, ret2]: + self.assertTrue(np.allclose(self.out_np, ret)) + + def test_fluid_api(self): + train_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + x = fluid.data('x', [-1, 
self.feature_size]) + labels = fluid.data('labels', [-1, 1], 'int64') + path_table = None + path_code = None + if self.is_custom: + path_table = fluid.data('path_table', [-1, -1], 'int64') + path_code = fluid.data('path_code', [-1, -1], 'int64') + weight_attr = I.NumpyArrayInitializer(self.weight_np) + bias_attr = I.NumpyArrayInitializer(self.bias_np) + out = fluid.layers.hsigmoid(x, labels, self.num_classes, + weight_attr, bias_attr, 'out', + path_table, path_code, self.is_custom) + + exe = fluid.Executor(self.place) + exe.run(startup_program) + feed_dict = {'x': self.x_np, 'labels': self.labels_np} + if self.is_custom: + feed_dict["path_code"] = self.path_code_np + feed_dict["path_table"] = self.path_table_np + ret, = exe.run(train_program, feed=feed_dict, fetch_list=[out]) + + self.assertTrue(np.allclose(ret, self.out_np)) + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + # test paddle.nn.HSigmoidLoss + self.assertRaises(ValueError, paddle.nn.HSigmoidLoss, 6, 1) + + # test paddle.nn.functional.hsigmoid_loss + x = paddle.static.data('x', [4, 6]) + label = paddle.static.data('label', [4, 1], 'int64') + weight = paddle.static.data('weight', [7, 6]) + bias = paddle.static.data('bias', [7]) + + x_int32 = paddle.static.data('x_int32', [4, 6], 'int32') + self.assertRaises(TypeError, F.hsigmoid_loss, x_int32, label, 8, + weight) + + label_float32 = paddle.static.data('label_float32', [4, 1], + 'float32') + self.assertRaises(TypeError, F.hsigmoid_loss, x, label_float32, 8, + weight) + + weight_int32 = paddle.static.data('weight_int32', [7, 6], 'int32') + self.assertRaises(TypeError, F.hsigmoid_loss, x, label, 8, + weight_int32) + + bias_int32 = paddle.static.data('bias_int32', [7], 'int32') + self.assertRaises( + TypeError, + F.hsigmoid_loss, + x, + label, + 8, + weight, + bias=bias_int32) + + path_table_int32 = paddle.static.data('path_table_int32', [7], + 'int32') + self.assertRaises( + TypeError, + F.hsigmoid_loss, + x, + label, + 8, + weight, + path_table=path_table_int32) + + path_code_int32 = paddle.static.data('path_code_int32', [7], + 'int32') + self.assertRaises( + TypeError, + F.hsigmoid_loss, + x, + label, + 8, + weight, + path_code=path_code_int32) + + # test paddle.fluid.layers.hsigmoid with program_guard(Program()): label = fluid.data('label', [4, 1], 'int64') # The input type must be Variable. 
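The TestHSigmoidLossAPI cases above exercise the new paddle.nn.functional.hsigmoid_loss and paddle.nn.HSigmoidLoss entry points in dygraph, paddle.static, and the legacy fluid.layers.hsigmoid form. Below is a condensed dygraph sketch of the call pattern the tests rely on; the shapes and positional argument order are taken from the test itself, while the random inputs are illustrative only.

import numpy as np
import paddle
import paddle.nn.functional as F

paddle.disable_static()

batch_size, feature_size, num_classes = 4, 6, 8
x = paddle.to_tensor(
    np.random.uniform(-1, 1, [batch_size, feature_size]).astype('float32'))
labels = paddle.to_tensor(
    np.random.randint(num_classes, size=(batch_size, 1), dtype='int64'))
weight = paddle.to_tensor(
    np.random.uniform(-1, 1, [num_classes - 1, feature_size]).astype('float32'))
bias = paddle.to_tensor(
    np.random.uniform(-1, 1, [num_classes - 1]).astype('float32'))

# Functional form: weight/bias are passed in explicitly; path_table/path_code
# stay None for the default (non-custom) binary tree.
out1 = F.hsigmoid_loss(x, labels, num_classes, weight, bias)

# Layer form: the parameters are created by the layer from weight_attr/bias_attr
# (defaults here); only data and labels are passed at call time.
m = paddle.nn.HSigmoidLoss(feature_size, num_classes)
out2 = m(x, labels)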
@@ -410,5 +596,17 @@ def test_errors(self): label_int32, 2) +class TestHSigmoidLossAPICustom(TestHSigmoidLossAPI): + def set_attrs(self): + self.is_custom = True + self.path_table_np = np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), ( + 0, 1, 4, -1, -1), (0, 2, -1, -1, -1)]).astype(np.int64) + self.path_code_np = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( + 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype(np.int64) + + def test_errors(self): + pass + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index bee53fd10f5fe..45709a358635c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -917,11 +917,6 @@ def test_load_compatible_with_keep_name_table(self): state_dict = emb.state_dict() fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy')) - para_state_dict, opti_state_dict = fluid.load_dygraph( - os.path.join('saved_dy', 'emb_dy'), True) - self.assertTrue(para_state_dict != None) - self.assertTrue(opti_state_dict == None) - para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy'), keep_name_table=True) self.assertTrue(para_state_dict != None) diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py new file mode 100644 index 0000000000000..6ad19658fd203 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest + +import paddle +import paddle.nn as nn +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.nn.initializer as initializer +from paddle.fluid.core import VarDesc + +DELTA = 0.00001 + + +def check_cast_op(op): + return op.type == 'cast' and \ + op.attr('in_dtype') == VarDesc.VarType.FP32 and \ + op.attr('out_dtype') == VarDesc.VarType.FP16 + + +class TestConstantInitializer(unittest.TestCase): + def static_test_constant_initializer_common(self, + init_inst, + dtype="float32", + value_target=0.0): + paddle.enable_static() + program = framework.Program() + block = program.global_block() + for _ in range(2): + block.create_parameter( + dtype=dtype, + shape=[5, 10], + lod_level=0, + name="param", + initializer=init_inst) + num_ops = 2 if dtype == "float16" else 1 + self.assertEqual(len(block.ops), num_ops) + init_op = block.ops[0] + self.assertEqual(init_op.type, 'fill_constant') + self.assertAlmostEqual(init_op.attr('value'), value_target, delta=DELTA) + paddle.disable_static() + return block + + def test_constant_initializer_default_value_static(self, dtype="float32"): + """Test the constant initializer with default value in static graph + """ + block = self.static_test_constant_initializer_common( + init_inst=initializer.Constant(), dtype=dtype, value_target=0.0) + return block + + def test_constant_initializer_default_value_dygraph(self, dtype="float32"): + """Test constant initializer with supplied value in dygraph + """ + with fluid.dygraph.guard(): + linear = nn.Linear(2, 4, weight_attr=nn.initializer.Constant()) + mat_target = np.ones((2, 4), dtype=dtype) * 0.0 + mat_linear = linear.weight.numpy() + mismatch = np.sum( + (mat_target - mat_linear) * (mat_target - mat_linear)) + self.assertAlmostEqual(mismatch, 0.0, delta=DELTA) + + def test_constant_initializer_static(self, dtype="float32"): + """Test constant initializer with supplied value in static graph + """ + block = self.static_test_constant_initializer_common( + init_inst=initializer.Constant(2.3), dtype=dtype, value_target=2.3) + return block + + def test_constant_initializer_dygraph(self, dtype="float32"): + """Test constant initializer with supplied value in dygraph + """ + with fluid.dygraph.guard(): + linear = nn.Linear( + 2, 4, weight_attr=nn.initializer.Constant(value=2.0)) + mat_target = np.ones((2, 4), dtype=dtype) * 2.0 + mat_linear = linear.weight.numpy() + mismatch = np.sum( + (mat_target - mat_linear) * (mat_target - mat_linear)) + self.assertAlmostEqual(mismatch, 0.0, delta=DELTA) + + def test_constant_initializer_fp16(self): + """Test constant initializer with float16 + """ + block = self.test_constant_initializer_default_value_static("float16") + self.assertTrue(check_cast_op(block.ops[1])) + block = self.test_constant_initializer_static("float16") + self.assertTrue(check_cast_op(block.ops[1])) + self.test_constant_initializer_default_value_dygraph("float16") + self.test_constant_initializer_dygraph("float16") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 9940424618504..71ec1271a041e 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -23,7 +23,7 @@ import paddle.fluid as fluid from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import declarative, 
ProgramTranslator -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX BATCH_SIZE = 32 BATCH_NUM = 10 @@ -127,8 +127,8 @@ class MultiLoadingLinearNet(fluid.dygraph.Layer): def __init__(self, size, model_path): super(MultiLoadingLinearNet, self).__init__() self._linear = Linear(size, size) - self._load_linear1 = fluid.dygraph.jit.load(model_path) - self._load_linear2 = fluid.dygraph.jit.load(model_path) + self._load_linear1 = paddle.jit.load(model_path) + self._load_linear2 = paddle.jit.load(model_path) @declarative def forward(self, x): @@ -218,23 +218,20 @@ def train_with_label(layer, input_size=784, label_size=1): class TestJitSaveLoad(unittest.TestCase): def setUp(self): - self.model_path = "model.test_jit_save_load" + self.model_path = "test_jit_save_load/model" # enable dygraph mode fluid.enable_dygraph() # config seed paddle.manual_seed(SEED) paddle.framework.random._manual_program_seed(SEED) - def train_and_save_model(self, model_path=None, configs=None): + def train_and_save_model(self, model_path=None): layer = LinearNet(784, 1) example_inputs, layer, _ = train(layer) final_model_path = model_path if model_path else self.model_path orig_input_types = [type(x) for x in example_inputs] - fluid.dygraph.jit.save( - layer=layer, - model_path=final_model_path, - input_spec=example_inputs, - configs=configs) + paddle.jit.save( + layer=layer, path=final_model_path, input_spec=example_inputs) new_input_types = [type(x) for x in example_inputs] self.assertEqual(orig_input_types, new_input_types) return layer @@ -243,13 +240,10 @@ def test_save_load(self): # train and save model train_layer = self.train_and_save_model() # load model - program_translator = ProgramTranslator() - program_translator.enable(False) - loaded_layer = fluid.dygraph.jit.load(self.model_path) + loaded_layer = paddle.jit.load(self.model_path) self.load_and_inference(train_layer, loaded_layer) self.load_dygraph_state_dict(train_layer) self.load_and_finetune(train_layer, loaded_layer) - program_translator.enable(True) def load_and_inference(self, train_layer, infer_layer): train_layer.eval() @@ -274,7 +268,7 @@ def load_dygraph_state_dict(self, train_layer): # construct new model new_layer = LinearNet(784, 1) orig_state_dict = new_layer.state_dict() - load_state_dict, _ = fluid.dygraph.load_dygraph(self.model_path) + load_state_dict = paddle.load(self.model_path) for structured_name in orig_state_dict: self.assertTrue(structured_name in load_state_dict) new_layer.set_state_dict(load_state_dict) @@ -286,20 +280,24 @@ def load_dygraph_state_dict(self, train_layer): np.array_equal(train_layer(x).numpy(), new_layer(x).numpy())) def test_load_dygraph_no_path(self): - model_path = "model.test_jit_save_load.no_path" - new_layer = LinearNet(784, 1) + model_path = "test_jit_save_load.no_path/model_path" with self.assertRaises(ValueError): model_dict, _ = fluid.dygraph.load_dygraph(model_path) def test_jit_load_model_incomplete(self): - model_path = "model.test_jit_save_load.remove_variables" - self.train_and_save_model(model_path=model_path) - # remove `__variables__` - var_path = os.path.join(model_path, VARIABLE_FILENAME) + model_path = "test_jit_save_load.remove_variables/model" + self.train_and_save_model(model_path) + # remove `.pdiparams` + var_path = model_path + INFER_PARAMS_SUFFIX os.remove(var_path) with self.assertRaises(ValueError): paddle.jit.load(model_path) + def 
test_jit_load_no_path(self): + path = "test_jit_save_load.no_path/model_path" + with self.assertRaises(ValueError): + loaded_layer = paddle.jit.load(path) + class TestSaveLoadWithInputSpec(unittest.TestCase): def setUp(self): @@ -313,8 +311,7 @@ def test_with_input_spec(self): net.forward, input_spec=[InputSpec( [None, 8], name='x')]) - model_path = "model.input_spec.output_spec" - configs = fluid.dygraph.jit.SaveLoadConfig() + model_path = "input_spec.output_spec/model" # check inputs and outputs self.assertTrue(len(net.forward.inputs) == 1) input_x = net.forward.inputs[0] @@ -322,11 +319,11 @@ def test_with_input_spec(self): self.assertTrue(input_x.name == 'x') # 1. prune loss - configs.output_spec = net.forward.outputs[:1] - fluid.dygraph.jit.save(net, model_path, configs=configs) + output_spec = net.forward.outputs[:1] + paddle.jit.save(net, model_path, output_spec=output_spec) # 2. load to infer - infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) + infer_layer = paddle.jit.load(model_path) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) pred = infer_layer(x) @@ -334,8 +331,7 @@ def test_with_input_spec(self): def test_multi_in_out(self): net = LinearNetMultiInput(8, 8) - model_path = "model.multi_inout.output_spec1" - configs = fluid.dygraph.jit.SaveLoadConfig() + model_path = "multi_inout.output_spec1/model" # 1. check inputs and outputs self.assertTrue(len(net.forward.inputs) == 2) input_x = net.forward.inputs[0] @@ -344,11 +340,11 @@ def test_multi_in_out(self): self.assertTrue(input_y.shape == (-1, 8)) # 2. prune loss - configs.output_spec = net.forward.outputs[:2] - fluid.dygraph.jit.save(net, model_path, configs=configs) + output_spec = net.forward.outputs[:2] + paddle.jit.save(net, model_path, output_spec=output_spec) # 3. load to infer - infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) + infer_layer = paddle.jit.load(model_path) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) y = fluid.dygraph.to_variable( @@ -357,11 +353,11 @@ def test_multi_in_out(self): pred_x, pred_y = infer_layer(x, y) # 1. prune y and loss - model_path = "model.multi_inout.output_spec2" - configs.output_spec = net.forward.outputs[:1] - fluid.dygraph.jit.save(net, model_path, [input_x], configs) + model_path = "multi_inout.output_spec2/model" + output_spec = net.forward.outputs[:1] + paddle.jit.save(net, model_path, [input_x], output_spec=output_spec) # 2. load again - infer_layer2 = fluid.dygraph.jit.load(model_path, configs=configs) + infer_layer2 = paddle.jit.load(model_path) # 3. predict pred_xx = infer_layer2(x) @@ -377,44 +373,6 @@ def setUp(self): paddle.manual_seed(SEED) paddle.framework.random._manual_program_seed(SEED) - def basic_save_load(self, layer, model_path, configs): - # 1. train & save - example_inputs, train_layer, _ = train(layer) - fluid.dygraph.jit.save( - layer=train_layer, - model_path=model_path, - input_spec=example_inputs, - configs=configs) - # 2. load - infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) - train_layer.eval() - # 3. 
inference & compare - x = fluid.dygraph.to_variable( - np.random.random((1, 784)).astype('float32')) - self.assertTrue( - np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy())) - - def test_model_filename(self): - layer = LinearNet(784, 1) - model_path = "model.save_load_config.output_spec" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.model_filename = "__simplenet__" - self.basic_save_load(layer, model_path, configs) - - def test_params_filename(self): - layer = LinearNet(784, 1) - model_path = "model.save_load_config.params_filename" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.params_filename = "__params__" - self.basic_save_load(layer, model_path, configs) - - def test_separate_params(self): - layer = LinearNet(784, 1) - model_path = "model.save_load_config.separate_params" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.separate_params = True - self.basic_save_load(layer, model_path, configs) - def test_output_spec(self): train_layer = LinearNetReturnLoss(8, 8) adam = fluid.optimizer.AdamOptimizer( @@ -427,27 +385,47 @@ def test_output_spec(self): adam.minimize(loss) train_layer.clear_gradients() - model_path = "model.save_load_config.output_spec" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [out] - fluid.dygraph.jit.save( + model_path = "save_load_config.output_spec" + output_spec = [out] + paddle.jit.save( layer=train_layer, - model_path=model_path, + path=model_path, input_spec=[x], - configs=configs) + output_spec=output_spec) train_layer.eval() - infer_layer = fluid.dygraph.jit.load(model_path, configs=configs) + infer_layer = paddle.jit.load(model_path) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) self.assertTrue( np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy())) + def test_save_no_support_config_error(self): + layer = LinearNet(784, 1) + path = "no_support_config_test" + with self.assertRaises(ValueError): + paddle.jit.save(layer=layer, path=path, model_filename="") + + def test_load_empty_model_filename_error(self): + path = "error_model_filename_test" + with self.assertRaises(ValueError): + paddle.jit.load(path, model_filename="") + + def test_load_empty_params_filename_error(self): + path = "error_params_filename_test" + with self.assertRaises(ValueError): + paddle.jit.load(path, params_filename="") + + def test_load_with_no_support_config(self): + path = "no_support_config_test" + with self.assertRaises(ValueError): + paddle.jit.load(path, separate_params=True) + class TestJitMultipleLoading(unittest.TestCase): def setUp(self): self.linear_size = 4 - self.model_path = "model.jit_multi_load" + self.model_path = "jit_multi_load/model" # enable dygraph mode fluid.enable_dygraph() # config seed @@ -459,8 +437,8 @@ def setUp(self): def train_and_save_orig_model(self): layer = LinearNet(self.linear_size, self.linear_size) example_inputs, layer, _ = train(layer, self.linear_size, 1) - fluid.dygraph.jit.save( - layer=layer, model_path=self.model_path, input_spec=example_inputs) + paddle.jit.save( + layer=layer, path=self.model_path, input_spec=example_inputs) def test_load_model_retransform_inference(self): multi_loaded_layer = MultiLoadingLinearNet(self.linear_size, @@ -475,7 +453,7 @@ def test_load_model_retransform_inference(self): class TestJitPruneModelAndLoad(unittest.TestCase): def setUp(self): self.linear_size = 4 - self.model_path = "model.jit_prune_model_and_load" + self.model_path = "jit_prune_model_and_load/model" # enable dygraph mode fluid.enable_dygraph() 
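The jit save/load tests in this file all follow the same migration: fluid.dygraph.jit.save(model_path=..., configs=SaveLoadConfig()) becomes paddle.jit.save(path=..., input_spec=..., output_spec=...), loading goes through paddle.jit.load on the same path prefix, and paddle.load on that prefix returns a plain state_dict, with parameters stored under the .pdiparams suffix. A minimal round-trip sketch of that new-style API follows; TinyNet, the warm-up call, and the "tiny_jit_save/model" prefix are illustrative assumptions, not taken from the tests.

import numpy as np
import paddle

paddle.disable_static()


class TinyNet(paddle.nn.Layer):
    def __init__(self):
        super(TinyNet, self).__init__()
        self._linear = paddle.nn.Linear(8, 8)

    @paddle.jit.to_static
    def forward(self, x):
        return self._linear(x)


net = TinyNet()
x = paddle.to_tensor(np.random.random((4, 8)).astype('float32'))
net(x)  # run the declarative forward once so a program is recorded

# New-style save: a path *prefix*, keyword arguments instead of SaveLoadConfig.
paddle.jit.save(net, path="tiny_jit_save/model")

# Reload for inference, or load just the parameters as a state_dict.
infer_net = paddle.jit.load("tiny_jit_save/model")
state_dict = paddle.load("tiny_jit_save/model")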
# config seed @@ -494,13 +472,12 @@ def train_and_save(self): adam.minimize(loss) train_layer.clear_gradients() - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.output_spec = [hidden] - fluid.dygraph.jit.save( + output_spec = [hidden] + paddle.jit.save( layer=train_layer, - model_path=self.model_path, + path=self.model_path, input_spec=[x], - configs=configs) + output_spec=output_spec) return train_layer @@ -508,7 +485,7 @@ def test_load_pruned_model(self): train_layer = self.train_and_save() train_layer.eval() - infer_layer = fluid.dygraph.jit.load(self.model_path) + infer_layer = paddle.jit.load(self.model_path) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32')) @@ -519,7 +496,7 @@ def test_load_var_not_in_extra_var_info(self): self.train_and_save() # chage extra var info - var_info_path = os.path.join(self.model_path, EXTRA_VAR_INFO_FILENAME) + var_info_path = self.model_path + INFER_PARAMS_INFO_SUFFIX with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) extra_var_info.clear() @@ -527,7 +504,7 @@ def test_load_var_not_in_extra_var_info(self): pickle.dump(extra_var_info, f, protocol=2) with self.assertRaises(RuntimeError): - fluid.dygraph.jit.load(self.model_path) + paddle.jit.load(self.model_path) class TestJitSaveMultiCases(unittest.TestCase): @@ -561,7 +538,7 @@ def test_no_prune_to_static_after_train(self): train(layer) - model_path = "test_no_prune_to_static_after_train" + model_path = "test_no_prune_to_static_after_train/model" paddle.jit.save(layer, model_path) self.verify_inference_correctness(layer, model_path) @@ -569,7 +546,7 @@ def test_no_prune_to_static_after_train(self): def test_no_prune_to_static_no_train(self): layer = LinearNetWithInputSpec(784, 1) - model_path = "test_no_prune_to_static_no_train" + model_path = "test_no_prune_to_static_no_train/model" paddle.jit.save(layer, model_path) self.verify_inference_correctness(layer, model_path) @@ -579,7 +556,7 @@ def test_no_prune_no_to_static_after_train(self): train(layer) - model_path = "test_no_prune_no_to_static_after_train" + model_path = "test_no_prune_no_to_static_after_train/model" paddle.jit.save( layer, model_path, @@ -593,16 +570,15 @@ def test_no_prune_no_to_static_after_train_with_examples(self): example_inputs, _, _ = train(layer) - model_path = "test_no_prune_no_to_static_after_train_with_examples" - fluid.dygraph.jit.save( - layer=layer, model_path=model_path, input_spec=example_inputs) + model_path = "test_no_prune_no_to_static_after_train_with_examples/model" + paddle.jit.save(layer=layer, path=model_path, input_spec=example_inputs) self.verify_inference_correctness(layer, model_path) def test_no_prune_no_to_static_no_train(self): layer = LinearNetNotDeclarative(784, 1) - model_path = "test_no_prune_no_to_static_no_train" + model_path = "test_no_prune_no_to_static_no_train/model" paddle.jit.save( layer, model_path, @@ -616,9 +592,7 @@ def test_prune_to_static_after_train(self): out = train_with_label(layer) - model_path = "test_prune_to_static_after_train" - configs = paddle.SaveLoadConfig() - configs.output_spec = [out] + model_path = "test_prune_to_static_after_train/model" paddle.jit.save( layer, model_path, @@ -626,18 +600,17 @@ def test_prune_to_static_after_train(self): InputSpec( shape=[None, 784], dtype='float32', name="image") ], - configs=configs) + output_spec=[out]) self.verify_inference_correctness(layer, model_path, True) def test_prune_to_static_no_train(self): layer = LinerNetWithLabel(784, 1) - model_path = "test_prune_to_static_no_train" - 
configs = paddle.SaveLoadConfig() + model_path = "test_prune_to_static_no_train/model" # TODO: no train, cannot get output_spec var here # now only can use index - configs.output_spec = layer.forward.outputs[:1] + output_spec = layer.forward.outputs[:1] paddle.jit.save( layer, model_path, @@ -645,7 +618,7 @@ def test_prune_to_static_no_train(self): InputSpec( shape=[None, 784], dtype='float32', name="image") ], - configs=configs) + output_spec=output_spec) self.verify_inference_correctness(layer, model_path, True) @@ -654,7 +627,7 @@ def test_no_prune_input_spec_name_warning(self): train(layer) - model_path = "test_no_prune_input_spec_name_warning" + model_path = "test_no_prune_input_spec_name_warning/model" paddle.jit.save( layer, model_path, @@ -675,18 +648,16 @@ def test_not_prune_output_spec_name_warning(self): train(layer) - model_path = "test_not_prune_output_spec_name_warning" - configs = paddle.SaveLoadConfig() + model_path = "test_not_prune_output_spec_name_warning/model" out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) - configs.output_spec = [out] - paddle.jit.save(layer, model_path, configs=configs) + paddle.jit.save(layer, model_path, output_spec=[out]) self.verify_inference_correctness(layer, model_path) def test_prune_input_spec_name_error(self): layer = LinerNetWithLabel(784, 1) - model_path = "test_prune_input_spec_name_error" + model_path = "test_prune_input_spec_name_error/model" with self.assertRaises(ValueError): paddle.jit.save( layer, @@ -707,10 +678,8 @@ def test_prune_output_spec_name_error(self): train_with_label(layer) - model_path = "test_prune_to_static_after_train" - configs = paddle.SaveLoadConfig() + model_path = "test_prune_to_static_after_train/model" out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) - configs.output_spec = [out] with self.assertRaises(ValueError): paddle.jit.save( layer, @@ -719,12 +688,12 @@ def test_prune_output_spec_name_error(self): InputSpec( shape=[None, 784], dtype='float32', name="image") ], - configs=configs) + output_spec=[out]) class TestJitSaveLoadEmptyLayer(unittest.TestCase): def setUp(self): - self.model_path = "model.jit_save_load_empty_layer" + self.model_path = "jit_save_load_empty_layer/model" # enable dygraph mode paddle.disable_static() @@ -740,7 +709,7 @@ def test_save_load_empty_layer(self): class TestJitSaveLoadNoParamLayer(unittest.TestCase): def setUp(self): - self.model_path = "model.jit_save_load_no_param_layer" + self.model_path = "jit_save_load_no_param_layer/model" # enable dygraph mode paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 26073f49bdd3d..e0ec676f1b14c 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1657,21 +1657,6 @@ def test_eye_op(self): with self.assertRaises(TypeError): layers.eye(num_rows=3, batch_shape=[-1]) - def test_hard_swish(self): - with self.static_graph(): - t = layers.data(name='t', shape=[3, 3], dtype='float32') - ret = layers.hard_swish(t) - static_ret = self.get_static_graph_result( - feed={'t': np.ones( - [3, 3], dtype='float32')}, fetch_list=[ret])[0] - - with self.dynamic_graph(): - t = np.ones([3, 3], dtype='float32') - dy_ret = layers.hard_swish(base.to_variable(t)) - dy_ret_rlt = dy_ret.numpy() - - self.assertTrue(np.allclose(static_ret, dy_ret_rlt)) - def test_while_loop(self): with self.static_graph(): i = layers.fill_constant(shape=[1], dtype='int64', value=0) @@ 
-2563,13 +2548,6 @@ def make_l2_normalize(self): output = layers.l2_normalize(x, axis=1) return output - def make_maxout(self): - with program_guard(fluid.default_main_program(), - fluid.default_startup_program()): - data = self._get_data(name='x', shape=[8, 6, 6], dtype="float32") - output = layers.maxout(x=data, groups=2) - return (output) - def make_crop(self): with program_guard(fluid.default_main_program(), fluid.default_startup_program()): @@ -2656,13 +2634,6 @@ def make_prelu(self): name='prelu') return (out) - def make_brelu(self): - with program_guard(fluid.default_main_program(), - fluid.default_startup_program()): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.brelu(input, t_min=1.0, t_max=20.0, name='brelu') - return (out) - def make_soft_relu(self): with program_guard(fluid.default_main_program(), fluid.default_startup_program()): diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index fdc1e6b52aba1..35ad6fdb30e7b 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -63,6 +63,8 @@ def setUp(self): self.epoch_num = 1 self.batch_size = 128 self.batch_num = 10 + # enable static mode + paddle.enable_static() def train_and_save_model(self, only_params=False): with new_program_scope(): @@ -136,13 +138,12 @@ def test_load_with_model_filename(self): self.params_filename = None orig_param_dict = self.train_and_save_model() - config = paddle.SaveLoadConfig() - config.separate_params = True - config.model_filename = self.model_filename - load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config) + load_param_dict, _ = fluid.load_dygraph( + self.save_dirname, model_filename=self.model_filename) self.check_load_state_dict(orig_param_dict, load_param_dict) - new_load_param_dict = paddle.load(self.save_dirname, config) + new_load_param_dict = paddle.load( + self.save_dirname, model_filename=self.model_filename) self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_with_param_filename(self): @@ -151,12 +152,12 @@ def test_load_with_param_filename(self): self.params_filename = "static_mnist.params" orig_param_dict = self.train_and_save_model() - config = paddle.SaveLoadConfig() - config.params_filename = self.params_filename - load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config) + load_param_dict, _ = fluid.load_dygraph( + self.save_dirname, params_filename=self.params_filename) self.check_load_state_dict(orig_param_dict, load_param_dict) - new_load_param_dict = paddle.load(self.save_dirname, config) + new_load_param_dict = paddle.load( + self.save_dirname, params_filename=self.params_filename) self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_with_model_and_param_filename(self): @@ -165,13 +166,16 @@ def test_load_with_model_and_param_filename(self): self.params_filename = "static_mnist.params" orig_param_dict = self.train_and_save_model() - config = paddle.SaveLoadConfig() - config.params_filename = self.params_filename - config.model_filename = self.model_filename - load_param_dict, _ = fluid.load_dygraph(self.save_dirname, config) + load_param_dict, _ = fluid.load_dygraph( + self.save_dirname, + params_filename=self.params_filename, + model_filename=self.model_filename) self.check_load_state_dict(orig_param_dict, load_param_dict) - 
new_load_param_dict = paddle.load(self.save_dirname, config) + new_load_param_dict = paddle.load( + self.save_dirname, + params_filename=self.params_filename, + model_filename=self.model_filename) self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_state_dict_from_save_params(self): diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 29a0fa55f7729..82443f8c5493b 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -20,14 +20,44 @@ import paddle.fluid.core as core from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers +import random +random.seed(2) +np.set_printoptions(threshold=np.inf) +paddle.enable_static() SIGMOID_THRESHOLD_MIN = -40.0 SIGMOID_THRESHOLD_MAX = 13.0 EXP_MAX_INPUT = 40.0 +class RandomWeight: + def __init__(self): + pass + + def updata_weight(self, hidden_size, input_size, dtype): + std = 1.0 / math.sqrt(hidden_size) + self.hidden_size = hidden_size + self.input_size = input_size + self.dtype = dtype + + self.weight_ih = np.random.uniform( + low=-std, high=std, size=(4 * self.hidden_size, + self.input_size)).astype(dtype) + self.weight_hh = np.random.uniform( + low=-std, high=std, size=(4 * self.hidden_size, + self.hidden_size)).astype(dtype) + self.bias_ih = np.random.uniform( + low=-std, high=std, size=(4 * self.hidden_size)).astype(dtype) + self.bias_hh = np.random.uniform( + low=-std, high=std, size=(4 * self.hidden_size)).astype(dtype) + + +weight = RandomWeight() + + class LayerMixin(object): def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) @@ -51,16 +81,13 @@ def __init__(self, input_size, hidden_size, bias=True): self.bias = bias self.dtype = np.float64 self.parameters = dict() - std = 1.0 / math.sqrt(hidden_size) - self.weight_ih = np.ones( - (4 * hidden_size, input_size), dtype=self.dtype) - self.weight_hh = np.ones((4 * hidden_size, - hidden_size)).astype(self.dtype) + self.weight_ih = weight.weight_ih + self.weight_hh = weight.weight_hh self.parameters['weight_ih'] = self.weight_ih self.parameters['weight_hh'] = self.weight_hh if bias: - self.bias_ih = np.ones((4 * hidden_size)).astype(self.dtype) - self.bias_hh = np.ones((4 * hidden_size)).astype(self.dtype) + self.bias_ih = weight.bias_ih + self.bias_hh = weight.bias_hh self.parameters['bias_ih'] = self.bias_ih self.parameters['bias_hh'] = self.bias_hh else: @@ -353,24 +380,26 @@ def __init__(self, @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestCUDNNLstmOp(OpTest): - #TODO(GaoWei8): Need to satisfy the result through the new interface + def get_weight_names(self): + weight_names = [] + for i in range(2 * self.num_layers): + weight_names.append('weight{}'.format(i)) + for i in range(2 * self.num_layers): + weight_names.append('bias{}'.format(i)) + return weight_names + def setUp(self): self.op_type = "cudnn_lstm" self.dtype = np.float64 self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32) self.num_layers = 1 + self.set_attrs() seq_length = 12 batch_size = 5 input_size = 21 hidden_size = 21 - input_weight_size = (hidden_size * hidden_size) * 4 - hidden_weight_size = (hidden_size * hidden_size) * 4 - weight_size = input_weight_size + hidden_weight_size - weight_size += hidden_size * 8 - weight_size *= self.num_layers - input = np.random.uniform( low=-0.1, high=0.1, 
size=(seq_length, batch_size, input_size)).astype(self.dtype) @@ -379,17 +408,39 @@ def setUp(self): input[9][3:][:] = 0 input[8][4:][:] = 0 + weight.updata_weight(hidden_size, input_size, self.dtype) rnn1 = LSTM( input_size, hidden_size, - self.num_layers, + num_layers=self.num_layers, time_major=True, direction="forward") output, (last_hidden, last_cell) = rnn1( input, sequence_length=self.sequence_length) - flat_w = np.ones((weight_size)).astype(self.dtype) + flat_w = [] + num = 0 + for i in range(self.num_layers): + if i == 0: + weight_ih = weight.weight_ih + else: + weight_ih = weight.weight_hh + flat_w.append(("weight" + str(num), weight_ih)) + num += 1 + for i in range(self.num_layers): + weight_hh = weight.weight_hh + flat_w.append(("weight" + str(num), weight_hh)) + num += 1 + num = 0 + for i in range(self.num_layers): + bias_ih = weight.bias_ih + flat_w.append(("bias" + str(num), bias_ih)) + num += 1 + for i in range(self.num_layers): + bias_hh = weight.bias_hh + flat_w.append(("bias" + str(num), bias_hh)) + num += 1 init_h = np.zeros((self.num_layers, batch_size, hidden_size)).astype(self.dtype) init_c = np.zeros((self.num_layers, batch_size, @@ -398,7 +449,7 @@ def setUp(self): self.inputs = { 'Input': input, - 'W': flat_w, + 'WeightList': flat_w, 'InitH': init_h, 'InitC': init_c, 'SequenceLength': self.sequence_length @@ -408,7 +459,7 @@ def setUp(self): 'is_bidirec': False, 'input_size': input_size, 'hidden_size': hidden_size, - 'num_layers': 1, + 'num_layers': self.num_layers, } self.outputs = { 'Out': output, @@ -428,16 +479,42 @@ def test_output_with_place(self): def test_grad_with_place(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, - set(['Input', 'W', 'InitH', 'InitC']), - ['Out', 'LastH', 'LastC']) + var_name_list = self.get_weight_names() + for var_name in var_name_list: + self.check_grad_with_place( + place, + set(['Input', var_name, 'InitH', 'InitC']), + ['Out', 'LastH', 'LastC']) @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestCUDNNLstmOp2(TestCUDNNLstmOp): - def set_attrs(self): - self.num_layers = 2 +class TestCUDNNlstmAPI(unittest.TestCase): + def test_lstm(self): + seq_len = 20 + batch_size = 5 + hidden_size = 20 + dropout_prob = 0.0 + num_layers = 1 + input = fluid.data( + name='input', + shape=[seq_len, batch_size, hidden_size], + dtype='float64') + init_h = layers.fill_constant([num_layers, batch_size, hidden_size], + 'float64', 0.0) + init_c = layers.fill_constant([num_layers, batch_size, hidden_size], + 'float64', 0.0) + rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len, + hidden_size, num_layers, + dropout_prob, False) + exe = fluid.Executor(fluid.CUDAPlace(0)) + exe.run(fluid.default_startup_program()) + input_i = np.random.uniform( + low=-0.1, high=0.1, size=(seq_len, batch_size, + hidden_size)).astype("float64") + out = exe.run(fluid.default_main_program(), + feed={'input': input_i}, + fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0']) @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -448,7 +525,7 @@ def test_lstm(self): batch_size = 5 hidden_size = 20 dropout_prob = 0.0 - num_layers = 1 + num_layers = 2 input = fluid.data( name='input', shape=[seq_len, batch_size, hidden_size], diff --git a/python/paddle/fluid/tests/unittests/test_maxout_op.py b/python/paddle/fluid/tests/unittests/test_maxout_op.py index 6781965b0b4e9..1d38c833773ca 100644 --- a/python/paddle/fluid/tests/unittests/test_maxout_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_maxout_op.py @@ -16,32 +16,43 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid -from paddle.fluid import Program, program_guard import paddle.fluid.core as core +import paddle.nn.functional as F from op_test import OpTest +paddle.enable_static() +np.random.seed(1) -def maxout_forward_naive(input, groups, channel_axis): - s0, s1, s2, s3 = input.shape - if channel_axis == 3: - return np.ndarray([s0, s1, s2, s3 // groups, groups], \ - buffer = input, dtype=input.dtype).max(axis=(4)) - return np.ndarray([s0, s1 // groups, groups, s2, s3], \ - buffer = input, dtype=input.dtype).max(axis=(2)) + +def maxout_forward_naive(x, groups, channel_axis): + s0, s1, s2, s3 = x.shape + if channel_axis == 1: + return np.ndarray([s0, s1 // groups, groups, s2, s3], \ + buffer = x, dtype=x.dtype).max(axis=2) + return np.ndarray([s0, s1, s2, s3 // groups, groups], \ + buffer = x, dtype=x.dtype).max(axis=4) class TestMaxOutOp(OpTest): def setUp(self): self.op_type = "maxout" - self.init_test_case() - input = np.random.random(self.shape) - output = self.MaxOut_forward_naive(input, self.groups, self.axis) + self.dtype = 'float64' + self.shape = [3, 6, 2, 4] + self.groups = 2 + self.axis = 1 + self.set_attrs() + + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out = maxout_forward_naive(x, self.groups, self.axis) - self.inputs = {'X': input} + self.inputs = {'X': x} self.attrs = {'groups': self.groups, 'axis': self.axis} + self.outputs = {'Out': out} - self.outputs = {'Out': output} + def set_attrs(self): + pass def test_check_output(self): self.check_output() @@ -49,65 +60,89 @@ def test_check_output(self): def test_check_grad(self): self.check_grad(['X'], 'Out') - def init_test_case(self): - self.MaxOut_forward_naive = maxout_forward_naive - self.shape = [100, 6, 2, 2] - self.groups = 2 - self.axis = 1 - -class TestMaxOutOpAxis(TestMaxOutOp): - def init_test_case(self): - self.MaxOut_forward_naive = maxout_forward_naive - self.shape = [100, 2, 2, 6] # NHWC format - self.groups = 2 - self.axis = 3 +class TestMaxOutOpAxis0(TestMaxOutOp): + def set_attrs(self): + self.axis = -1 -class TestMaxOutOpAxisAPI(unittest.TestCase): - def test_axis(self): - data1 = fluid.data(name='data1', shape=[3, 6, 2, 2], dtype='float32') - data2 = fluid.data(name='data2', shape=[3, 2, 2, 6], dtype='float32') - out1 = fluid.layers.maxout(data1, groups=2, axis=1) - out2 = fluid.layers.maxout(data2, groups=2, axis=-1) - data1_np = np.random.random((3, 6, 2, 2)).astype("float32") - data2_np = np.transpose(data1_np, [0, 2, 3, 1]) +class TestMaxOutOpAxis1(TestMaxOutOp): + def set_attrs(self): + self.axis = 3 - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - results = exe.run(fluid.default_main_program(), - feed={"data1": data1_np, - "data2": data2_np}, - fetch_list=[out1, out2], - return_numpy=True) - self.assertTrue( - np.allclose(results[0], np.transpose(results[1], (0, 3, 1, 2)))) +class TestMaxOutOpFP32(TestMaxOutOp): + def set_attrs(self): + self.dtype = 'float32' - def test_exception(self): - input = fluid.data(name="input", shape=[2, 4, 6, 6], dtype="float32") - def _attr_axis(): - out = fluid.layers.maxout(input, groups=2, axis=2) +class TestMaxOutOpGroups(TestMaxOutOp): + def set_attrs(self): + self.groups = 3 - self.assertRaises(ValueError, _attr_axis) +class TestMaxoutAPI(unittest.TestCase): + # test paddle.nn.Maxout, 
paddle.nn.functional.maxout + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [2, 6, 5, 4]).astype(np.float64) + self.groups = 2 + self.axis = 1 + self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.maxout(x, self.groups, self.axis) + m = paddle.nn.Maxout(self.groups, self.axis) + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis) + for r in res: + self.assertTrue(np.allclose(out_ref, r)) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.maxout(x, self.groups, self.axis) + m = paddle.nn.Maxout(self.groups, self.axis) + out2 = m(x) + out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis) + for r in [out1, out2]: + self.assertTrue(np.allclose(out_ref, r.numpy())) + + out3 = F.maxout(x, self.groups, -1) + out3_ref = maxout_forward_naive(self.x_np, self.groups, -1) + self.assertTrue(np.allclose(out3_ref, out3.numpy())) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.maxout(x, groups=self.groups, axis=self.axis) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = maxout_forward_naive(self.x_np, self.groups, self.axis) + self.assertTrue(np.allclose(out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.maxout(x, groups=self.groups, axis=self.axis) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() -class TestMaxOutOpError(unittest.TestCase): def test_errors(self): - with program_guard(Program()): + with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, fluid.layers.maxout, 1, 2) + self.assertRaises(TypeError, F.maxout, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32') - self.assertRaises(TypeError, fluid.layers.maxout, x_int32, 2) - # support the input dtype is float32 - x_fp32 = fluid.data(name='x_fp32', shape=[12, 10], dtype='float32') - fluid.layers.maxout(x_fp32, 2) + x_int32 = paddle.data( + name='x_int32', shape=[2, 4, 6, 8], dtype='int32') + self.assertRaises(TypeError, F.maxout, x_int32) + + x_float32 = paddle.data(name='x_float32', shape=[2, 4, 6, 8]) + self.assertRaises(ValueError, F.maxout, x_float32, 2, 2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index 5d1e016287e07..1675f935f7d6a 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -22,8 +22,8 @@ import paddle.fluid.layers as layers import paddle.fluid.core as core import gradient_checker - from decorator_helper import prog_scope +paddle.enable_static() class TestMulGradCheck(unittest.TestCase): @@ -153,6 +153,38 @@ def test_grad(self): self.func(p) +class TestMatmulDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + eps = 0.005 + x_shapes = [[2], [2, 3], [2, 4, 3], [2, 3, 4, 5], [2, 3, 4]] + y_shapes = [[2], [3, 2], [2, 4, 5], [2, 3, 3, 5], [4, 3]] + transpose_xs = [False, True, True, False, False] + transpose_ys = [False, True, False, True, False] + dtypes = [np.float64, np.float64, np.float32, np.float32, np.float64] + typenames = ["float64", "float64", "float32", "float32", "float64"] + for i, (x_shape, y_shape, transpose_x, transpose_y, dtype, typename) \ + in enumerate(zip(x_shapes, y_shapes, transpose_xs, transpose_ys, dtypes, typenames)): + x = layers.create_parameter( + dtype=typename, shape=x_shape, name='x{}'.format(i)) + y = layers.create_parameter( + dtype=typename, shape=y_shape, name='y{}'.format(i)) + out = layers.matmul( + x, y, transpose_x, transpose_y, name='out{}'.format(i)) + + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + y_arr = np.random.uniform(-1, 1, y_shape).astype(dtype) + gradient_checker.double_grad_check( + [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestReshapeDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): @@ -249,5 +281,53 @@ def test_grad(self): self.func(p) +class TestSqueezeDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [1, 3, 1, 40] + axes = [0, 2] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = paddle.squeeze(x, axes) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestUnsqueezeDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [3, 40] + axes = [1, 2] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = paddle.unsqueeze(x, axes) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + 
places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index a12a328b653b2..5e8828c3e9126 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -102,14 +102,21 @@ def avg_pool2D_forward_naive(x, c_start = adaptive_start_index(j, W, ksize[1]) c_end = adaptive_end_index(j, W, ksize[1]) else: - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + r_start = i * strides[0] - paddings[0] + r_end = i * strides[0] + ksize[0] - paddings[0] + c_start = j * strides[1] - paddings[1] + c_end = j * strides[1] + ksize[1] - paddings[1] + field_size = (r_end - r_start) * (c_end - c_start) + r_start = np.max((r_start, 0)) + r_end = np.min((r_end, H)) + c_start = np.max((c_start, 0)) + c_end = np.min((c_end, W)) + x_masked = x[:, :, r_start:r_end, c_start:c_end] - field_size = ((r_end - r_start) * (c_end - c_start)) \ - if (exclusive or adaptive) else (ksize[0] * ksize[1]) + if (exclusive or adaptive): + field_size = (r_end - r_start) * (c_end - c_start) + if data_type == np.int8 or data_type == np.uint8: out[:, :, i, j] = (np.rint( np.sum(x_masked, axis=(2, 3)) / @@ -207,22 +214,34 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): in_w_start = adaptive_start_index(j, W, ksize[1]) in_w_end = adaptive_end_index(j, W, ksize[1]) else: - in_w_start = np.max((j * strides[1] - pad_w_left, 0)) - in_w_end = np.min((j * strides[1] + ksize[1] - pad_w_left, W)) + in_h_start = i * strides[0] - pad_h_up + in_w_start = j * strides[1] - pad_w_left + in_h_end = i * strides[0] + ksize[0] - pad_h_up + in_w_end = j * strides[1] + ksize[1] - pad_w_left + + field_size = (in_h_end - in_h_start) * (in_w_end - in_w_start) + in_h_start = np.max((in_h_start, 0)) + in_w_start = np.max((in_w_start, 0)) + in_h_end = np.min((in_h_end, H)) + in_w_end = np.min((in_w_end, W)) if data_format == 'NCHW': x_masked = x[:, :, in_h_start:in_h_end, in_w_start:in_w_end] if pool_type == 'avg': - field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \ - if (exclusive or adaptive) else (ksize[0] * ksize[1]) + if (exclusive or adaptive): + field_size = (in_h_end - in_h_start) * ( + in_w_end - in_w_start) + +# if (exclusive or adaptive) else (ksize[0] * ksize[1]) out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size elif pool_type == 'max': out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) elif data_format == 'NHWC': x_masked = x[:, in_h_start:in_h_end, in_w_start:in_w_end, :] if pool_type == 'avg': - field_size = ((in_h_end - in_h_start) * (in_w_end - in_w_start)) \ - if (exclusive or adaptive) else (ksize[0] * ksize[1]) + if (exclusive or adaptive): + field_size = (in_h_end - in_h_start) * ( + in_w_end - in_w_start) out[:, i, j, :] = np.sum(x_masked, axis=(1, 2)) / field_size elif pool_type == 'max': out[:, i, j, :] = np.max(x_masked, axis=(1, 2)) diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index 3d139e9b90c10..eab7126c7a422 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -116,32 +116,44 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): if adaptive: d_start = adaptive_start_index(k, D, ksize[0]) d_end = adaptive_end_index(k, D, ksize[0]) - else: - d_start = np.max((k * strides[0] - pad_d_forth, 0)) - d_end = np.min((k * strides[0] + ksize[0] - pad_d_forth, D)) for i in range(H_out): if adaptive: h_start = adaptive_start_index(i, H, ksize[1]) h_end = adaptive_end_index(i, H, ksize[1]) - else: - h_start = np.max((i * strides[1] - pad_h_up, 0)) - h_end = np.min((i * strides[1] + ksize[1] - pad_h_up, H)) for j in range(W_out): if adaptive: w_start = adaptive_start_index(j, W, ksize[2]) w_end = adaptive_end_index(j, W, ksize[2]) else: - w_start = np.max((j * strides[2] - pad_w_left, 0)) - w_end = np.min((j * strides[2] + ksize[2] - pad_w_left, W)) + d_start = k * strides[0] - pad_d_forth + d_end = np.min((k * strides[0] + ksize[0] - pad_d_forth, + D + pad_d_back)) + h_start = i * strides[1] - pad_h_up + h_end = np.min( + (i * strides[1] + ksize[1] - pad_h_up, H + pad_h_down)) + w_start = j * strides[2] - pad_w_left + w_end = np.min((j * strides[2] + ksize[2] - pad_w_left, + W + pad_w_right)) + + field_size = (d_end - d_start) * (h_end - h_start) * ( + w_end - w_start) + w_start = np.max((w_start, 0)) + d_start = np.max((d_start, 0)) + h_start = np.max((h_start, 0)) + w_end = np.min((w_end, W)) + d_end = np.min((d_end, D)) + h_end = np.min((h_end, H)) if data_format == 'NCDHW': x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start: w_end] if pool_type == 'avg': - field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ - if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] + if (exclusive or adaptive): + field_size = (d_end - d_start) * ( + h_end - h_start) * (w_end - w_start) + out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size elif pool_type == 'max': @@ -151,8 +163,10 @@ def _get_padding_with_SAME(input_shape, pool_size, pool_stride): x_masked = x[:, d_start:d_end, h_start:h_end, w_start: w_end, :] if pool_type == 'avg': - field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ - if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] + if (exclusive or adaptive): + field_size = (d_end - d_start) * ( + h_end - h_start) * (w_end - w_start) + out[:, k, i, j, :] = np.sum(x_masked, axis=(1, 2, 3)) / field_size elif pool_type == 'max': @@ -564,7 +578,7 @@ def init_exclusive(self): self.exclusive = False def init_paddings(self): - self.paddings = [1, 2, 1, 1, 1, 0] + self.paddings = [2, 2, 1, 1, 0, 0] @unittest.skipIf(not core.is_compiled_with_cuda(), diff --git a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py index a1879c724597e..377f8597cca3b 100644 --- a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py +++ b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py @@ -18,6 +18,8 @@ import numpy as np from op_test import OpTest +import paddle +paddle.enable_static() # Correct: General. diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py index 5ab13cec540aa..830678fe8f6af 100644 --- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py @@ -20,6 +20,7 @@ from paddle.fluid import compiler, Program, program_guard import paddle from op_test import OpTest +paddle.enable_static() # Correct: General. 
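Note on the pool2d/pool3d reference changes above: `field_size` for average pooling is now derived from the padded window extent captured before clipping when `exclusive` is off, while the exclusive (and adaptive) path divides only by the in-bounds element count. A minimal NumPy sketch of the two conventions for a single 2-D window follows; the helper name and the 4x4 example array are illustrative and not taken from the test files.

import numpy as np

def avg_window(x, r0, r1, c0, c1, exclusive):
    """Average one pooling window of a 2-D map.

    (r0, r1, c0, c1) describe the window in padded coordinates, so the
    bounds may fall outside the input; x.shape gives the unpadded extents.
    """
    H, W = x.shape
    # Full (padded) window extent, captured before clipping -- this is the
    # divisor on the non-exclusive path.
    full_size = (r1 - r0) * (c1 - c0)
    # Clip the window to the valid region of the input.
    r0, r1 = max(r0, 0), min(r1, H)
    c0, c1 = max(c0, 0), min(c1, W)
    window = x[r0:r1, c0:c1]
    # Exclusive (and adaptive) pooling divides by the in-bounds count only.
    field_size = window.size if exclusive else full_size
    return window.sum() / field_size

x = np.arange(16, dtype=np.float64).reshape(4, 4)
# A 3x3 window hanging one row and one column into the top-left padding.
print(avg_window(x, -1, 2, -1, 2, exclusive=True))   # 10 / 4 = 2.5
print(avg_window(x, -1, 2, -1, 2, exclusive=False))  # 10 / 9 ~= 1.111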
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py index 340d22acbfb51..eaecf91215cc6 100644 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py @@ -18,6 +18,8 @@ import numpy as np import paddle.fluid as fluid from op_test import OpTest +import paddle +paddle.enable_static() # Correct: General. diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py index 1975e4306026e..f8d27dd42f43b 100644 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -19,6 +19,7 @@ import paddle import paddle.fluid as fluid from op_test import OpTest +paddle.enable_static() # Correct: General. diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py new file mode 100755 index 0000000000000..7aaa78856811f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUAccuracyOp(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype('int64') + label = np.random.randint(0, 2, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") + } + self.attrs = {'use_xpu': True} + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py new file mode 100644 index 0000000000000..1a21b0f1972b7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py @@ -0,0 +1,207 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard + + +# situation 1: have shape( list, no tensor), no actual shape(Tensor) +class TestReshapeOp(OpTest): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} + self.attrs = {"shape": self.new_shape, "use_xpu": True} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } + + def init_data(self): + self.ori_shape = (2, 60) + self.new_shape = (12, 10) + self.infered_shape = (12, 10) + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ["X"], "Out") + + +class TestReshapeOpDimInfer1(TestReshapeOp): + def init_data(self): + self.ori_shape = (5, 25) + self.new_shape = (5, -1, 5) + self.infered_shape = (5, -1, 5) + + +class TestReshapeOpDimInfer2(TestReshapeOp): + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + + +# situation 2: have shape(list, no tensor), have actual shape(Tensor) +class TestReshapeOpWithInputShape(OpTest): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + + self.inputs = { + "X": np.random.random(self.ori_shape).astype("float32"), + "Shape": np.array( + self.actual_shape, dtype="int32") + } + self.attrs = {"shape": self.new_shape, "use_xpu": True} + self.outputs = { + "Out": self.inputs["X"].reshape(self.actual_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } + + def init_data(self): + self.ori_shape = (6, 20) + self.new_shape = (0, -1, 20) + self.actual_shape = (2, 3, 20) + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ["X"], "Out") + + +# Situation 3: have shape(list, have tensor), no actual shape(Tensor) +class TestReshapeOp_attr_ShapeTensor(OpTest): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + + shape_tensor = [] + for index, ele in enumerate(self.new_shape): + shape_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + "X": np.random.random(self.ori_shape).astype("float32"), + 'ShapeTensor': shape_tensor + } + self.attrs = {'shape': self.shape, "use_xpu": True} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': 
np.random.random(self.ori_shape).astype("float32") + } + + def init_data(self): + self.ori_shape = (4, 25) + self.new_shape = (10, 10) + self.infered_shape = (10, 10) + self.shape = (-1, -1) + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ["X"], "Out") + + +class TestReshapeOpDimInfer1_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor): + def init_data(self): + self.ori_shape = (5, 20) + self.new_shape = (5, -1, 20) + self.infered_shape = (5, -1, 20) + self.shape = (5, -1, -1) + + +class TestReshapeOpDimInfer2_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor): + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + self.shape = (10, 0, 3, -1) + + +# Situation 4: have shape(Tensor), no actual shape(Tensor) +class TestReshapeOp_attr_OnlyShape(OpTest): + def setUp(self): + self.init_data() + self.op_type = "reshape2" + + self.inputs = { + "X": np.random.random(self.ori_shape).astype("float32"), + "Shape": np.array( + self.new_shape, dtype="int32") + } + self.attrs = {"use_xpu": True} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } + + def init_data(self): + self.ori_shape = (4, 25) + self.new_shape = (10, 10) + self.infered_shape = (10, 10) + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['XShape']) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ["X"], "Out") + + +class TestReshapeOpDimInfer1_attr_OnlyShape(TestReshapeOp_attr_OnlyShape): + def init_data(self): + self.ori_shape = (5, 20) + self.new_shape = (5, -1, 10) + self.infered_shape = (5, -1, 10) + self.shape = (5, -1, -1) + + +class TestReshapeOpDimInfer2_attr_OnlyShape(TestReshapeOp_attr_OnlyShape): + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + self.shape = (10, 0, 3, -1) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py new file mode 100644 index 0000000000000..1f74fa5e2d685 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py @@ -0,0 +1,54 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUScaleOp(OpTest): + def setUp(self): + self.op_type = "scale" + self.dtype = np.float32 + self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)} + self.attrs = {'scale': -2.3, 'use_xpu': True} + self.outputs = { + 'Out': self.inputs['X'] * self.dtype(self.attrs['scale']) + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py new file mode 100644 index 0000000000000..f194f3ca80cf0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +from paddle.fluid import core +from paddle.fluid.op import Operator + + +class TestShapeOp(OpTest): + def setUp(self): + self.op_type = "shape" + self.config() + self.shape = [2, 3] + input = np.zeros(self.shape) + self.inputs = {'Input': input} + self.outputs = {'Out': np.array(self.shape)} + + def config(self): + self.shape = [2, 3] + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +class case1(TestShapeOp): + def config(self): + self.shape = [2] + + +class case2(TestShapeOp): + def config(self): + self.shape = [1, 2, 3] + + +class TestShapeWithSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + if core.is_compiled_with_xpu(): + places.append(core.XPUPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 1, 5, 4, 19] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + # initialize input variable Out + out_shape = scope.var("Out").get_tensor() + op = Operator("shape", Input="X", Out="Out") + + op.run(scope, place) + + out_shape = np.array(out_shape).tolist() + self.assertListEqual([5, 2], out_shape) + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py new file mode 100644 index 0000000000000..ab07221a07071 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py @@ -0,0 +1,54 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSignOp(OpTest): + def setUp(self): + self.op_type = "sign" + self.dtype = np.float32 + self.inputs = { + 'X': np.random.uniform(-10, 10, (10, 10)).astype(self.dtype) + } + self.outputs = {'Out': np.sign(self.inputs['X'])} + self.attrs = {'use_xpu': True} + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py new file mode 100644 index 0000000000000..92842fbc2e65a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py @@ -0,0 +1,93 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +import sys +import unittest +sys.path.append("..") +from op_test import OpTest + +paddle.enable_static() +np.random.seed(10) + + +def stable_softmax(x): + """Compute the softmax of vector x in a numerically stable way.""" + # clip to shiftx, otherwise, when calc loss with + # log(exp(shiftx)), may get log(0)=INF + shiftx = (x - np.max(x)).clip(-64.) 
+ exps = np.exp(shiftx) + return exps / np.sum(exps) + + +def ref_softmax(x, axis=None, dtype=None): + x_t = x.copy() + if dtype is not None: + x_t = x_t.astype(dtype) + if axis is None: + axis = -1 + return np.apply_along_axis(stable_softmax, axis, x_t) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftmaxOp(OpTest): + def setUp(self): + self.op_type = "softmax" + self.dtype = np.float32 + self.shape = [2, 3, 4, 5] + self.axis = -1 + self.set_attrs() + + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + out = np.apply_along_axis(stable_softmax, self.axis, x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis, 'use_xpu': True} + + def set_attrs(self): + pass + + def test_check_output(self): + self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) + + def test_check_grad(self): + self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out') + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftmaxAxis3(TestXPUSoftmaxOp): + def set_attrs(self): + self.axis = 3 + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftmax2D(TestXPUSoftmaxOp): + def set_attrs(self): + self.shape = [10, 12] + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftmax3D(TestXPUSoftmaxOp): + def set_attrs(self): + self.shape = [4, 5, 6] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py new file mode 100644 index 0000000000000..3bafbf649e6ce --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py @@ -0,0 +1,61 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSumOp(OpTest): + def setUp(self): + self.op_type = "sum" + self.use_mkldnn = False + self.init_kernel_type() + x0 = np.random.random((3, 40)).astype(self.dtype) + x1 = np.random.random((3, 40)).astype(self.dtype) + x2 = np.random.random((3, 40)).astype(self.dtype) + self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]} + y = x0 + x1 + x2 + self.outputs = {'Out': y} + self.attrs = {'use_mkldnn': self.use_mkldnn, 'use_xpu': True} + + def init_kernel_type(self): + self.dtype = np.float32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['x0'], 'Out') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 2ce442add2e02..7e2f0eb2fb8bb 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -20,8 +20,8 @@ ] __all__ += [ - 'grad', 'LayerList', 'load', 'save', 'SaveLoadConfig', 'to_variable', - 'no_grad', 'DataParallel' + 'grad', 'LayerList', 'load', 'save', 'to_variable', 'no_grad', + 'DataParallel' ] __all__ += [ @@ -50,7 +50,6 @@ from ..fluid.dygraph.base import grad #DEFINE_ALIAS from .io import save from .io import load -from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 7175f3101448f..c196c1d689bfe 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -26,7 +26,9 @@ from paddle import fluid from paddle.fluid import core from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer -from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.jit import _SaveLoadConfig +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX __all__ = [ 'save', @@ -55,19 +57,16 @@ def _load_state_dict_from_save_inference_model(model_path, config): # 2. load layer parameters & buffers with fluid.dygraph.guard(): persistable_var_dict = _construct_params_and_buffers( - model_path, - programs, - config.separate_params, - config.params_filename, - append_suffix=False) + model_path, programs, config.params_filename, append_suffix=False) # 3. 
construct state_dict load_param_dict = dict() for var_name in persistable_var_dict: load_param_dict[var_name] = persistable_var_dict[var_name].numpy() - # if __variables.info__ exists, we can recover structured_name - var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) + # if *.info exists, we can recover structured_name + var_info_filename = str(config.params_filename) + ".info" + var_info_path = os.path.join(model_path, var_info_filename) if os.path.exists(var_info_path): with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) @@ -116,12 +115,99 @@ def _load_state_dict_from_save_params(model_path): return load_param_dict +# NOTE(chenweihang): [ Handling of use cases of API paddle.load ] +# `paddle.load` may be used to load saved results of: +# 1. Expected cases: +# - need [full filename] when loading +# - paddle.save +# - paddle.static.save +# - paddle.fluid.save_dygraph +# - need [prefix] when loading [compatible for paddle 2.x] +# - paddle.jit.save +# - paddle.static.save_inference_model +# - need [directory] when loading [compatible for paddle 1.x] +# - paddle.fluid.io.save_inference_model +# - paddle.fluid.io.save_params/save_persistable +# 2. Error cases: +# - no error case +def _build_load_path_and_config(path, config): + # NOTE(chenweihang): If both [prefix save format] and [directory save format] exist, + # raise error, avoid confusing behavior + prefix_format_path = path + INFER_MODEL_SUFFIX + prefix_format_exist = os.path.exists(prefix_format_path) + directory_format_exist = os.path.isdir(path) + if prefix_format_exist and directory_format_exist: + raise ValueError( + "The %s.pdmodel and %s directory exist at the same time, " + "don't know which one to load, please make sure that the specified target " + "of ``path`` is unique." % (path, path)) + elif not prefix_format_exist and not directory_format_exist: + error_msg = "The ``path`` (%s) to load model not exists." + # if current path is a prefix, and the path.pdparams or path.pdopt + # is exist, users may want use `paddle.load` load the result of + # `fluid.save_dygraph`, we raise error here for users + params_file_path = path + ".pdparams" + opti_file_path = path + ".pdopt" + if os.path.exists(params_file_path) or os.path.exists(opti_file_path): + error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \ + "please specify the full file name, not just the file name prefix. For " \ + "example, it should be written as `paddle.load('model.pdparams')` instead of " \ + "`paddle.load('model')`." 
+ raise ValueError(error_msg % path) + else: + if prefix_format_exist: + file_prefix = os.path.basename(path) + model_path = os.path.dirname(path) + if config.model_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``model_filename`` config does " + "not take effect.") + config.model_filename = file_prefix + INFER_MODEL_SUFFIX + if config.params_filename is not None: + warnings.warn( + "When loading the result saved with the " + "specified file prefix, the ``params_filename`` config does " + "not take effect.") + config.params_filename = file_prefix + INFER_PARAMS_SUFFIX + else: + # Compatible with the old save_inference_model format + model_path = path + + return model_path, config + + +def _parse_load_config(configs): + supported_configs = ['model_filename', 'params_filename', 'keep_name_table'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.load` is not supported." + % key) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.model_filename = configs.get('model_filename', None) + inner_config.params_filename = configs.get('params_filename', None) + inner_config.keep_name_table = configs.get('keep_name_table', None) + + return inner_config + + def save(obj, path): ''' Save an object to the specified path. .. note:: Now only supports save ``state_dict`` of Layer or Optimizer. + + .. note:: + ``paddle.save`` will not add a suffix to the saved results, + but we recommend that you use the following paddle standard suffixes: + 1. for ``Layer.state_dict`` -> ``.pdparams`` + 2. for ``Optimizer.state_dict`` -> ``.pdopt`` Args: obj(Object) : The object to be saved. @@ -178,7 +264,7 @@ def save(obj, path): pickle.dump(saved_obj, f, protocol=2) -def load(path, config=None): +def load(path, **configs): ''' Load an object can be used in paddle from specified path. @@ -186,21 +272,39 @@ def load(path, config=None): Now only supports load ``state_dict`` of Layer or Optimizer. .. note:: - ``paddle.load`` supports loading ``state_dict`` from the result of several - paddle1.x save APIs in static mode, but due to some historical reasons, - if you load ``state_dict`` from the saved result of - ``paddle.static.save_inference_model/paddle.fluid.io.save_params/paddle.fluid.io.save_persistables`` , + ``paddle.load`` supports loading ``state_dict`` of Layer or Optimizer from + the result of other save APIs except ``paddle.load`` , but the argument + ``path`` format is different: + 1. loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` , + ``path`` needs to be a complete file name, such as ``model.pdparams`` or + ``model.pdopt`` ; + 2. loading from ``paddle.jit.save`` or ``paddle.static.save_inference_model`` + or ``paddle.Model().save(training=False)`` , ``path`` need to be a file prefix, + such as ``model/mnist``, and ``paddle.load`` will get information from + ``mnist.pdmodel`` and ``mnist.pdiparams`` ; + 3. loading from paddle 1.x APIs ``paddle.fluid.io.save_inference_model`` or + ``paddle.fluid.io.save_params/save_persistables`` , ``path`` need to be a + directory, such as ``model`` and model is a directory. + + .. note:: + If you load ``state_dict`` from the saved result of + ``paddle.static.save`` or ``paddle.static.save_inference_model`` , the structured variable name will cannot be restored. You need to set the argument ``use_structured_name=False`` when using ``Layer.set_state_dict`` later. 
Args: path(str) : The path to load the target object. Generally, the path is the target - file path, when compatible with loading the saved results of - ``paddle.jit.save/paddle.static.save_inference_model`` , the path is a directory. - config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` - object that specifies additional configuration options, these options - are for compatibility with ``paddle.jit.save/paddle.static.save_inference_model`` - formats. Default None. + file path. When compatible with loading the saved results other APIs, the path + can be a file prefix or directory. + **configs (dict, optional): other load configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, + DO NOT use them. Default None. + The following options are currently supported: + (1) model_filename (string): The inference model file name of the paddle 1.x + ``save_inference_model`` save format. Default file name is :code:`__model__` . + (2) params_filename (string): The persistable variables file name of the paddle 1.x + ``save_inference_model`` save format. No default file name, save variables separately + by default. Returns: Object(Object): a target object can be used in paddle @@ -227,26 +331,9 @@ def load(path, config=None): load_layer_state_dict = paddle.load("emb.pdparams") load_opt_state_dict = paddle.load("adam.pdopt") ''' - # 1. input check - if not os.path.exists(path): - error_msg = "The path `%s` does not exist." - # if current path is a prefix, and the path.pdparams or path.pdopt - # is exist, users may want use `paddle.load` load the result of - # `fluid.save_dygraph`, we raise error here for users - params_file_path = path + ".pdparams" - opti_file_path = path + ".pdopt" - if os.path.exists(params_file_path) or os.path.exists(opti_file_path): - error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \ - "please specify the full file name, not just the file name prefix. For " \ - "example, it should be written as `paddle.load('model.pdparams')` instead of " \ - "`paddle.load('model')`." - raise ValueError(error_msg % path) - - if config is None: - config = paddle.SaveLoadConfig() - - # 2. 
load target load_result = None + config = _parse_load_config(configs) + if os.path.isfile(path): # we think path is file means this file is created by paddle.save with open(path, 'rb') as f: @@ -255,16 +342,15 @@ def load(path, config=None): if not config.keep_name_table and "StructuredToParameterName@@" in load_result: del load_result["StructuredToParameterName@@"] - elif os.path.isdir(path): - # we think path is directory means compatible with loading - # store results of static mode related save APIs - + else: + # file prefix and directory are compatible cases + model_path, config = _build_load_path_and_config(path, config) # check whether model file exists if config.model_filename is None: model_filename = '__model__' else: model_filename = config.model_filename - model_file_path = os.path.join(path, model_filename) + model_file_path = os.path.join(model_path, model_filename) if os.path.exists(model_file_path): # Load state dict by `jit.save/io.save_inference_model` save format @@ -274,7 +360,7 @@ def load(path, config=None): # `save_inference_model` not save structured name, we need to remind # the user to configure the `use_structured_name` argument when `set_state_dict` # NOTE(chenweihang): `jit.save` doesn't save optimizer state - load_result = _load_state_dict_from_save_inference_model(path, + load_result = _load_state_dict_from_save_inference_model(model_path, config) else: # load state dict by `io.save_params/persistables` save format @@ -283,9 +369,6 @@ def load(path, config=None): # mapping info will lost, so users need to give variable list, but users build # variable list in dygraph mode is difficult, we recommend users to use # paddle.static.load_program_state in this case - load_result = _load_state_dict_from_save_params(path) - else: - raise ValueError( - "Unsupported path format, now only supports file or directory.") + load_result = _load_state_dict_from_save_params(model_path) return load_result diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 8505544a71f58..21e3054dde7d7 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -201,8 +201,11 @@ def _init_context(): def _update_input_shapes(inputs): + "Get input shape list by given inputs in Model initialization." 
shapes = None - if isinstance(inputs, list): + if isinstance(inputs, Input): + shapes = [list(inputs.shape)] + elif isinstance(inputs, list): shapes = [list(input.shape) for input in inputs] elif isinstance(inputs, dict): shapes = [list(inputs[name].shape) for name in inputs] @@ -638,19 +641,14 @@ def train_batch(self, inputs, labels=None): if self._nranks > 1: outputs = self.ddp_model.forward(* [to_variable(x) for x in inputs]) - losses = self.model._loss(*(to_list(outputs) + labels)) - losses = to_list(losses) - final_loss = fluid.layers.sum(losses) - final_loss = self.ddp_model.scale_loss(final_loss) - final_loss.backward() - self.ddp_model.apply_collective_grads() else: outputs = self.model.network.forward( * [to_variable(x) for x in inputs]) - losses = self.model._loss(*(to_list(outputs) + labels)) - losses = to_list(losses) - final_loss = fluid.layers.sum(losses) - final_loss.backward() + + losses = self.model._loss(*(to_list(outputs) + labels)) + losses = to_list(losses) + final_loss = fluid.layers.sum(losses) + final_loss.backward() self.model._optimizer.minimize(final_loss) self.model.network.clear_gradients() @@ -922,9 +920,7 @@ def train_batch(self, inputs, labels=None): """ loss = self._adapter.train_batch(inputs, labels) if fluid.in_dygraph_mode() and self._input_shapes is None: - self._input_shapes = self._adapter._input_shapes - self._is_shape_inferred = True - self._inputs = self._verify_spec(None, self._input_shapes, True) + self._update_inputs() return loss def eval_batch(self, inputs, labels=None): @@ -972,9 +968,7 @@ def eval_batch(self, inputs, labels=None): """ loss = self._adapter.eval_batch(inputs, labels) if fluid.in_dygraph_mode() and self._input_shapes is None: - self._input_shapes = self._adapter._input_shapes - self._is_shape_inferred = True - self._inputs = self._verify_spec(None, self._input_shapes, True) + self._update_inputs() return loss def test_batch(self, inputs): @@ -1017,9 +1011,7 @@ def test_batch(self, inputs): """ loss = self._adapter.test_batch(inputs) if fluid.in_dygraph_mode() and self._input_shapes is None: - self._input_shapes = self._adapter._input_shapes - self._is_shape_inferred = True - self._inputs = self._verify_spec(None, self._input_shapes, True) + self._update_inputs() return loss def save(self, path, training=True): @@ -1712,7 +1704,7 @@ def get_inout_spec(all_vars, return_name=False): layer = self.network if self._input_shapes is None: # No provided or inferred raise RuntimeError( - "Saving inference model needs 'inputs' or running before saving. Please specify 'inputs' in Model initialization or input training zqqdata and perform a training for shape derivation." + "Saving inference model needs 'inputs' or running before saving. Please specify 'inputs' in Model initialization or input training data and perform a training for shape derivation." ) if self._is_shape_inferred: warnings.warn( @@ -1958,3 +1950,9 @@ def _len_data_loader(self, data_loader): except Exception: steps = None return steps + + def _update_inputs(self): + "Update self._inputs according to given inputs." 
+ self._input_shapes = self._adapter._input_shapes + self._is_shape_inferred = True + self._inputs = self._verify_spec(None, self._input_shapes, True) diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index c46a53e910df0..30b22a2f32c34 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -254,7 +254,7 @@ def build_input(input_size, dtypes): dtype = dtypes[0] else: dtype = dtypes - return paddle.rand(list(input_size), dtype) + return paddle.cast(paddle.rand(list(input_size)), dtype) else: return [ build_input(i, dtype) for i, dtype in zip(input_size, dtypes) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index a2290fe179732..e45222a39915d 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -50,6 +50,7 @@ from .layer.activation import GELU #DEFINE_ALIAS from .layer.activation import Tanh #DEFINE_ALIAS from .layer.activation import Hardshrink #DEFINE_ALIAS +from .layer.activation import Hardswish #DEFINE_ALIAS from .layer.activation import Hardtanh #DEFINE_ALIAS from .layer.activation import PReLU #DEFINE_ALIAS from .layer.activation import ReLU #DEFINE_ALIAS @@ -57,14 +58,17 @@ from .layer.activation import SELU #DEFINE_ALIAS from .layer.activation import LeakyReLU #DEFINE_ALIAS from .layer.activation import Sigmoid #DEFINE_ALIAS +from .layer.activation import Hardsigmoid #DEFINE_ALIAS from .layer.activation import LogSigmoid from .layer.activation import Softmax #DEFINE_ALIAS from .layer.activation import Softplus #DEFINE_ALIAS from .layer.activation import Softshrink #DEFINE_ALIAS from .layer.activation import Softsign #DEFINE_ALIAS +from .layer.activation import Swish #DEFINE_ALIAS from .layer.activation import Tanhshrink #DEFINE_ALIAS +from .layer.activation import ThresholdedReLU #DEFINE_ALIAS from .layer.activation import LogSoftmax #DEFINE_ALIAS -from .layer.activation import HSigmoid #DEFINE_ALIAS +from .layer.activation import Maxout #DEFINE_ALIAS from .layer.common import BilinearTensorProduct #DEFINE_ALIAS from .layer.common import Pool2D #DEFINE_ALIAS from .layer.common import Pad2D #DEFINE_ALIAS @@ -123,6 +127,7 @@ # from .layer.loss import NCELoss #DEFINE_ALIAS from .layer.loss import BCEWithLogitsLoss #DEFINE_ALIAS from .layer.loss import CrossEntropyLoss #DEFINE_ALIAS +from .layer.loss import HSigmoidLoss #DEFINE_ALIAS from .layer.loss import MSELoss #DEFINE_ALIAS from .layer.loss import L1Loss #DEFINE_ALIAS from .layer.loss import NLLLoss #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 93203df359462..32ea5839e7aea 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -29,15 +29,13 @@ __all__ += pooling.__all__ from . 
import loss __all__ += loss.__all__ -from .activation import brelu #DEFINE_ALIAS from .activation import elu #DEFINE_ALIAS from .activation import erf #DEFINE_ALIAS from .activation import gelu #DEFINE_ALIAS from .activation import hardshrink #DEFINE_ALIAS from .activation import hardtanh #DEFINE_ALIAS -from .activation import hard_sigmoid #DEFINE_ALIAS -from .activation import hard_swish #DEFINE_ALIAS -from .activation import hsigmoid #DEFINE_ALIAS +from .activation import hardsigmoid #DEFINE_ALIAS +from .activation import hardswish #DEFINE_ALIAS from .activation import leaky_relu #DEFINE_ALIAS from .activation import log_sigmoid #DEFINE_ALIAS from .activation import maxout #DEFINE_ALIAS @@ -141,6 +139,7 @@ from .loss import cross_entropy #DEFINE_ALIAS from .loss import dice_loss #DEFINE_ALIAS from .loss import edit_distance #DEFINE_ALIAS +from .loss import hsigmoid_loss #DEFINE_ALIAS from .loss import iou_similarity #DEFINE_ALIAS from .loss import kl_div #DEFINE_ALIAS from .loss import l1_loss #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index f7bbe0c94e03d..33ecd29162c12 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -13,27 +13,19 @@ # limitations under the License. # TODO: define activation functions of neural network -from ...fluid.layers import brelu #DEFINE_ALIAS from ...fluid.layers import erf #DEFINE_ALIAS -from ...fluid.layers import hard_sigmoid #DEFINE_ALIAS -from ...fluid.layers import hard_swish #DEFINE_ALIAS -from ...fluid.layers import maxout #DEFINE_ALIAS from ...fluid.layers import soft_relu #DEFINE_ALIAS -from ...fluid.layers import swish #DEFINE_ALIAS from ...fluid.layers import sigmoid #DEFINE_ALIAS -from ...fluid.layers import thresholded_relu #DEFINE_ALIAS from ...tensor.math import tanh #DEFINE_ALIAS __all__ = [ - 'brelu', 'elu', 'erf', 'gelu', 'hardshrink', 'hardtanh', - 'hard_sigmoid', - 'hard_swish', - 'hsigmoid', + 'hardsigmoid', + 'hardswish', 'leaky_relu', 'log_sigmoid', 'maxout', @@ -75,10 +67,10 @@ def elu(x, alpha=1.0, name=None): alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: A Tensor with the same data type and shape as ``x`` . - + Examples: .. code-block:: python @@ -89,7 +81,7 @@ def elu(x, alpha=1.0, name=None): paddle.disable_static() x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) - out = F.elu(x, alpha=0.2) + out = F.elu(x, alpha=0.2) # [[-0.12642411 6. ] # [ 1. 15.6 ]] """ @@ -123,16 +115,16 @@ def gelu(x, approximate=False, name=None): .. math:: gelu(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) - + Parameters: x (Tensor): The input Tensor with data type float32, float64. approximate (bool, optional): Wether to enable approximation. Default is False. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: A Tensor with the same data type and shape as ``x`` . - + Examples: .. 
code-block:: python @@ -265,125 +257,106 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): return out -def hsigmoid(input, - label, - weight, - bias, - num_classes, - path_table=None, - path_code=None, - is_sparse=False): +def hardsigmoid(x, name=None): """ - :alias_main: paddle.nn.functional.hsigmoid - :alias: paddle.nn.functional.hsigmoid,paddle.nn.functional.activation.hsigmoid - - The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity - and speed up the model training, especially the training of language model. - Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. - For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on - the path, and sum them to get a total cost. - Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` - represents the number of classes or the size of word dict. - - The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural - Network Language Model `_. For the custom - tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): - - 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. - 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. - 3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code. - Code means the label of each binary classifier, 1 indicate true, 0 indicate false. - 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related - to the same batch of inputs. + hardsigmoid activation. + + A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), + which is much faster than sigmoid. + + .. math:: + + hardsigmoid(x)= + \\left\\{ + \\begin{aligned} + &0, & & \\text{if } x \\leq -3 \\\\ + &1, & & \\text{if } x \\geq 3 \\\\ + &x/6 + 1/2, & & \\text{otherwise} + \\end{aligned} + \\right. Parameters: - input (Variable): A tensor with the shape [N, D], where N is the size of mini-batch, - and D is the feature size. Its data type supports float32 and float64. - label (Variable): A tensor contains the labels of training data. Its shape is [N, 1] - and data type is int64. - weight (Variable): A tensor with shape (num_classes - 1, D) if not using custom tree(path_code and path_table is None), or (num_classes, D) if using custom tree. - bias (Variable): A tensor with shape (num_classes - 1, 1) if not using custom tree(path_code and path_table is None), or (num_classes, 1) if using custom tree. - num_classes (int): The number of classes or the size of word dict, must be greater than 2. - If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes` - should not be None. If the custom tree is used (:attr:`is_custom` is set to True), - :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of - classes using by the binary classifier. - path_table (Variable, optional): A tensor that stores each batch of samples' path from leaf to root - node, its shape is [N, L] and data type is int64, where L is the length of path. 
For each sample i, - path_table[i] is a np.array like structure and each element in this array is the indexes in parent - nodes' weight matrix. Default: None. - path_code (Variable, optional): A tensor that stores each batch of samples' code of path from leaf - to root node, its shape is [N, L] and data type is int64, which is the same as :attr:`path_table`. - Each code of path is consisted with the code of nodes from leaf to root node. Default: None. - is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the - gradient of W and input will be sparse. Default: False. + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as :attr:`input`. + A Tensor with the same data type and shape as ``x`` . Examples: .. code-block:: python - from paddle import fluid, nn - import paddle.fluid.dygraph as dg + import paddle import paddle.nn.functional as F - import numpy as np - main = fluid.Program() - start = fluid.Program() - feature_size = 6 - num_classes = 8 - with fluid.unique_name.guard(): - with fluid.program_guard(main, start): - x = fluid.data("input", [-1, feature_size], - dtype="float32") - label = fluid.data("labels", [-1, 1], dtype="int64") - w = fluid.data("weight", (num_classes -1, feature_size), dtype="float32") - b = fluid.data("bias", (num_classes -1, ), dtype="float32") - y = F.hsigmoid(x, label, w, b, num_classes) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(start) - feed_dict = { - "input": np.random.randn(4, feature_size).astype(np.float32), - "labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64), - "weight": np.random.randn(num_classes - 1, feature_size).astype(np.float32), - "bias": np.random.randn(num_classes - 1, ).astype(np.float32), - } - y_np, = exe.run(main, feed=feed_dict, fetch_list=[y]) - print(y_np.shape) - - # (4, 1) + x = paddle.to_tensor([-4., 5., 1.]) + out = F.hardsigmoid(x) # [0., 1., 0.666667] """ - attrs = { - "num_classes": num_classes, - "is_sparse": is_sparse, - "remote_prefetch": is_sparse - } + if in_dygraph_mode(): + return core.ops.hard_sigmoid(x, 'slope', 0.1666666666666667, 'offset', + 0.5) - inputs = { - "X": input, - "W": weight, - "Bias": bias, - "PathTable": path_table, - "PathCode": path_code, - "Label": label - } + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'hardsigmoid') - helper = LayerHelper('hierarchical_sigmoid', **locals()) - dtype = helper.input_dtype() + helper = LayerHelper('hardsigmoid', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='hard_sigmoid', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': 0.1666666666666667, + 'offset': 0.5}) + return out - out = helper.create_variable_for_type_inference(dtype) - pre_out = helper.create_variable_for_type_inference(dtype) - outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight} - helper.append_op( - type="hierarchical_sigmoid", - inputs=inputs, - outputs=outputs, - attrs=attrs) +def hardswish(x, name=None): + """ + hardswish activation + + hardswish is proposed in MobileNetV3, and performs better in computational stability + and efficiency compared to swish function. For more details please refer + to: https://arxiv.org/pdf/1905.02244.pdf + + .. 
math:: + + hardswish(x)= + \\left\\{ + \\begin{aligned} + &0, & & \\text{if } x \\leq -3 \\\\ + &x, & & \\text{if } x \\geq 3 \\\\ + &\\frac{x(x+3)}{6}, & & \\text{otherwise} + \\end{aligned} + \\right. + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([-4., 5., 1.]) + out = F.hardswish(x) # [0., 5., 0.666667] + """ + + if in_dygraph_mode(): + return core.ops.hard_swish(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'hardswish') + + helper = LayerHelper('hardswish', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op(type='hard_swish', inputs={'X': x}, outputs={'Out': out}) return out @@ -489,7 +462,7 @@ def prelu(x, weight, name=None): assert len(weight.shape ) == 1, "The dim count of weight shape should be 1 in prelu()." - # NOTE(): The input of this API should be ``N,C,...`` format, + # NOTE(): The input of this API should be ``N,C,...`` format, # which means x.shape[0] is batch_size and x.shape[0] is channel. mode = 'all' if weight.shape[0] > 1: @@ -559,15 +532,15 @@ def log_sigmoid(x, name=None): .. math:: log\\_sigmoid(x) = log \\frac{1}{1 + e^{-x}} - + Parameters: x (Tensor): The input Tensor with data type float32, float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: A Tensor with the same data type and shape as ``x`` . - + Examples: .. code-block:: python @@ -591,6 +564,81 @@ def log_sigmoid(x, name=None): return out +def maxout(x, groups, axis=1, name=None): + """ + maxout activation. + + Assumed the input shape is (N, Ci, H, W). + The output shape is (N, Co, H, W). + Then Co = Ci/groups and the operator formula is as follows: + + .. math:: + + &out_{si+j} = \\max_{k} x_{gsi + sk + j} \\\\ + &g = groups \\\\ + &s = \\frac{input.size}{num\\_channels} \\\\ + &0 \\le i < \\frac{num\\_channels}{groups} \\\\ + &0 \\le j < s \\\\ + &0 \\le k < groups + + Parameters: + x (Tensor): The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C], the data type + of input is float32 or float64. + groups (int, optional): The groups number of maxout. `groups` specifies the + index of channel dimension where maxout will be performed. This must be + a factor of number of features. Default is 1. + axis (int, optional): The axis along which to perform maxout calculations. + It should be 1 when data format is NCHW, be -1 or 3 when data format + is NHWC. If ``axis`` < 0, it works the same way as :math:`axis + D` , + where D is the dimensions of ``x`` . ``axis`` only supports 1, 3 or -1. + Default is 1. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.rand([1, 2, 3, 4]) + # [[[[0.5002636 0.22272532 0.17402348 0.2874594 ] + # [0.95313174 0.6228939 0.7129065 0.7087491 ] + # [0.02879342 0.88725346 0.61093384 0.38833922]] + # [[0.5231306 0.03807496 0.91661984 0.15602879] + # [0.666127 0.616567 0.30741522 0.24044901] + # [0.7142536 0.7351477 0.31588817 0.23782359]]]] + out = F.maxout(x, groups=2) + # [[[[0.5231306 0.22272532 0.91661984 0.2874594 ] + # [0.95313174 0.6228939 0.7129065 0.7087491 ] + # [0.7142536 0.88725346 0.61093384 0.38833922]]]] + """ + + if in_dygraph_mode(): + return core.ops.maxout(x, 'groups', groups, 'axis', axis) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'maxout') + if axis not in [1, -1, 3]: + raise ValueError( + "Attr(axis) should be 1 when data format is NCHW, -1 or 3 when data format is NHWC. Received " + "Attr(axis): %s." % str(axis)) + if axis == -1: + axis = 3 + + helper = LayerHelper('maxout', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='maxout', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'groups': groups, + 'axis': axis}) + return out + + def relu6(x, name=None): """ relu6 activation @@ -614,8 +662,6 @@ def relu6(x, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-1, 0.3, 6.5])) out = F.relu6(x) # [0, 0.3, 6] """ @@ -666,8 +712,6 @@ def selu(x, import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]])) out = F.selu(x) # [[0, 1.050701],[2.101402, 3.152103]] """ @@ -778,7 +822,7 @@ def softmax(x, axis=-1, dtype=None, name=None): :math:`axis + D` . Default is -1. dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data type of the output tensor. If dtype is specified, ``x`` is casted - to ``dtype`` before the operation is performed. This is useful for + to ``dtype`` before the operation is performed. This is useful for preventing data type overflows. Supported dtype: float32, float64. If ``dtype`` is None, the output Tensor has the same dtype as x. Default is None. @@ -881,8 +925,6 @@ def softplus(x, beta=1, threshold=20, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355] """ @@ -930,8 +972,6 @@ def softshrink(x, threshold=0.5, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8])) out = F.softshrink(x) # [-0.4, 0, 0, 0.3] """ @@ -978,8 +1018,6 @@ def softsign(x, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] """ @@ -994,6 +1032,47 @@ def softsign(x, name=None): return out +def swish(x, name=None): + """ + swish activation. + + .. math:: + + swish(x) = \\frac{x}{1 + e^{-x}} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + x = paddle.to_tensor(np.array([-2., 0., 1.])) + out = F.swish(x) # [-0.238406, 0., 0.731059] + """ + + if in_dygraph_mode(): + return core.ops.swish(x, 'beta', 1.0) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish') + helper = LayerHelper('swish', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='swish', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'beta': 1.0}) + return out + + def tanhshrink(x, name=None): """ tanhshrink activation @@ -1017,8 +1096,6 @@ def tanhshrink(x, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] """ @@ -1033,6 +1110,52 @@ def tanhshrink(x, name=None): return out +def thresholded_relu(x, threshold=1.0, name=None): + """ + thresholded relu activation. + + .. math:: + + thresholded\\_relu(x) = \\begin{cases} + x, \\text{if } x > threshold \\\\ + 0, \\text{otherwise} + \\end{cases} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + threshold (float, optional): The value of threshold for thresholded_relu. Default is 1.0 + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + import numpy as np + + x = paddle.to_tensor(np.array([2., 0., 1.])) + out = F.thresholded_relu(x) # [2., 0., 0.] + """ + + if in_dygraph_mode(): + return core.ops.thresholded_relu(x, 'threshold', threshold) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'thresholded_relu') + helper = LayerHelper('thresholded_relu', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='thresholded_relu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}) + return out + + def log_softmax(x, axis=-1, dtype=None, name=None): """ This operator implements the log_softmax layer. The calculation process is @@ -1051,13 +1174,13 @@ def log_softmax(x, axis=-1, dtype=None, name=None): :math:`axis + D` . Default is -1. dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data type of the output tensor. If dtype is specified, ``x`` is casted - to ``dtype`` before the operation is performed. This is useful for + to ``dtype`` before the operation is performed. This is useful for preventing data type overflows. Supported dtype: float32, float64. If ``dtype`` is None, the output Tensor has the same dtype as x. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: A Tensor with the same shape and data type (use ``dtype`` if it is specified) as x. 
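The functional activations added above (``hardsigmoid``, ``hardswish``, ``swish``, ``thresholded_relu``, ``maxout``) can be exercised together as below. This is an illustrative sketch, not part of the patch; it assumes a Paddle 2.0 dygraph session, and the commented values simply mirror the expected outputs quoted in the new docstrings.

import paddle
import paddle.nn.functional as F

x = paddle.to_tensor([-4., 5., 1.])
out1 = F.hardsigmoid(x)                                     # [0., 1., 0.666667]
out2 = F.hardswish(x)                                       # [0., 5., 0.666667]
out3 = F.swish(paddle.to_tensor([-2., 0., 1.]))             # [-0.238406, 0., 0.731059]
out4 = F.thresholded_relu(paddle.to_tensor([2., 0., 1.]))   # [2., 0., 0.]
# maxout expects a 4-D NCHW/NHWC tensor and `groups` must evenly divide the channel count.
out5 = F.maxout(paddle.rand([1, 2, 3, 4]), groups=2)        # shape: [1, 1, 3, 4]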
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index c4b5606dddcf1..d085213dffc23 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -54,6 +54,7 @@ 'cross_entropy', 'dice_loss', 'edit_distance', + 'hsigmoid_loss', 'iou_similarity', 'kl_div', 'l1_loss', @@ -343,6 +344,138 @@ def binary_cross_entropy_with_logits(logit, return out + +def hsigmoid_loss(input, + label, + num_classes, + weight, + bias=None, + path_table=None, + path_code=None, + is_sparse=False, + name=None): + """ + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity + and speed up the model training, especially the training of language model. + Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. + For each class(word), there's a unique path from root to itself, hsigmoid calculates the cost for each non-leaf node on + the path, and sums them to get a total cost. + Compared to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` + represents the number of classes or the size of word dict. + + The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural + Network Language Model `_. For the custom + tree, you need to pass :attr:`path_table` and :attr:`path_code`, and do the following steps (take the language model as an example): + + 1. Using a custom word dict to build a binary tree, each leaf node should be a word in the word dict. + 2. Creating a dict that maps word_id -> path from the word to the root node, we call it path_table. + 3. Creating a dict that maps word_id -> code of the path from the word to the root node, we call it path_code. + Code means the label of each binary classifier, 1 indicates true, 0 indicates false. + 4. Now, each word should have its path and code along the path, and you can pass a batch of paths and codes related + to the same batch of inputs. + + Parameters: + input (Tensor): A tensor with the shape [N, D], where N is the size of mini-batch, + and D is the feature size. Its data type supports float32 or float64. + label (Tensor): A tensor that contains the labels of training data. Its shape is [N, 1] + and data type is int64. + num_classes (int): The number of classes or the size of word dict, must be greater than 2. + If the default tree is used (path_code and path_table are None), `num_classes` + should not be None. If the custom tree is used (path_code and path_table are not None), + `num_classes` should be the number of non-leaf nodes, which indicates the number of + classes used by the binary classifier. + weight (Tensor): A tensor with shape (num_classes - 1, D), with the same data type as `input`. + bias (Tensor, optional): A tensor with shape (num_classes - 1, 1), with the same data type as `input`. + If `bias` is None, no bias will be added. Default is None. + path_table (Tensor, optional): A tensor that stores each batch of samples' path from leaf to root + node, its shape is [N, L] and data type is int64, where L is the length of path. For each sample i, + path_table[i] is a np.array like structure and each element in this array is an index into the parent + nodes' weight matrix. If `path_table` and `path_code` are None, the default tree will be used. + Default is None.
+ path_code (Tensor, optional): A tensor that stores each batch of samples' code of path from leaf + to root node, its shape is [N, L] and data type is int64, which is the same as :attr:`path_table`. + Each code of path is consisted with the code of nodes from leaf to root node. If `path_table` and + `path_code` are None, the default tree will be used. Default is None. + is_sparse (bool, optional): Whether use sparse updating instead of dense updating. If `is_sparse` is True, + the gradient of `weight` and `input` will be sparse. Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A tensor with the cost of hierarchical sigmoid, its shape is [N, 1] and data type is the same as `input`. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + paddle.set_device('cpu') + + input = paddle.uniform([2, 3]) + # [[-0.8018668 0.8736385 -0.9064771 ] # random + # [-0.10228515 -0.87188244 -0.8783718 ]] # random + label = paddle.to_tensor([0, 1, 4, 5]) + num_classes = 5 + weight=paddle.uniform([num_classes-1, 3]) + # [[-0.24148715 0.8449961 -0.7399121 ] # random + # [-0.9800559 0.43509364 0.9091208 ] # random + # [ 0.60194826 0.10430074 -0.4521166 ] # random + # [-0.4469818 -0.01536179 -0.604454 ]] # random + + out=F.hsigmoid_loss(input, label, num_classes, weight) + # [[3.0159328] + # [2.2407534]] + """ + + if in_dygraph_mode(): + out, _, _ = core.ops.hierarchical_sigmoid( + input, weight, label, path_table, path_code, bias, 'num_classes', + num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse) + return out + + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'hsigmoid_loss') + check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid_loss') + check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], + 'hsigmoid_loss') + if bias is not None: + check_variable_and_dtype(bias, 'bias', ['float32', 'float64'], + 'hsigmoid_loss') + if path_table is not None: + check_variable_and_dtype(path_table, 'path_table', ['int64'], + 'hsigmoid_loss') + if path_code is not None: + check_variable_and_dtype(path_code, 'path_code', ['int64'], + 'hsigmoid_loss') + + attrs = { + "num_classes": num_classes, + "is_sparse": is_sparse, + "remote_prefetch": is_sparse + } + + inputs = { + "X": input, + "W": weight, + "Bias": bias, + "PathTable": path_table, + "PathCode": path_code, + "Label": label + } + + helper = LayerHelper('hsigmoid_loss', **locals()) + out = helper.create_variable_for_type_inference(input.dtype) + pre_out = helper.create_variable_for_type_inference(input.dtype) + outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight} + + helper.append_op( + type="hierarchical_sigmoid", + inputs=inputs, + outputs=outputs, + attrs=attrs) + return out + + def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): """ This operator calculates smooth_l1_loss. Creates a criterion that uses a squared diff --git a/python/paddle/nn/initializer/__init__.py b/python/paddle/nn/initializer/__init__.py index 489f324868a3e..db0f5dbff2b80 100644 --- a/python/paddle/nn/initializer/__init__.py +++ b/python/paddle/nn/initializer/__init__.py @@ -13,21 +13,23 @@ # limitations under the License. 
# TODO: define the initializers to create a Parameter in neural network - from ...fluid.initializer import Bilinear #DEFINE_ALIAS -from ...fluid.initializer import Constant #DEFINE_ALIAS from ...fluid.initializer import MSRA #DEFINE_ALIAS from ...fluid.initializer import Normal #DEFINE_ALIAS from ...fluid.initializer import TruncatedNormal #DEFINE_ALIAS from ...fluid.initializer import Uniform #DEFINE_ALIAS from ...fluid.initializer import Xavier #DEFINE_ALIAS +from . import constant +from .constant import Constant #DEFINE_ALIAS + __all__ = [ 'Bilinear', - 'Constant', 'MSRA', 'Normal', 'TruncatedNormal', 'Uniform', 'Xavier', ] + +__all__ += constant.__all__ diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py new file mode 100644 index 0000000000000..6d21ddae0d16b --- /dev/null +++ b/python/paddle/nn/initializer/constant.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO: define the initializers of Constant in neural network +from ...fluid.initializer import ConstantInitializer + +__all__ = ['Constant'] + + +class Constant(ConstantInitializer): + """Implement the constant initializer. + + Args: + value (float32): constant value to initialize the parameter + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + + data = paddle.rand([30, 10, 2], dtype='float32') + linear = nn.Linear(2, + 4, + weight_attr=nn.initializer.Constant(value=2.0)) + res = linear(data) + print(linear.weight.numpy()) + #result is [[2. 2. 2. 2.],[2. 2. 2. 2.]] + + """ + + def __init__(self, value=0.0): + if value is None: + raise ValueError("value must not be none.") + super(Constant, self).__init__(value=value, force_cpu=False) diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 760af09f1f2f5..3a5bcaa21fe5b 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -41,7 +41,6 @@ from .activation import Sigmoid #DEFINE_ALIAS # from .activation import Softmax #DEFINE_ALIAS from .activation import LogSoftmax #DEFINE_ALIAS -from .activation import HSigmoid #DEFINE_ALIAS from .common import BilinearTensorProduct #DEFINE_ALIAS from .common import Bilinear #DEFINE_ALIAS from .common import Pool2D #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 585d369c607e5..dbb9d00f365cf 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -18,6 +18,7 @@ 'ELU', 'GELU', 'Hardshrink', + 'Hardswish', 'Tanh', 'Hardtanh', 'PReLU', @@ -26,14 +27,17 @@ 'SELU', 'LeakyReLU', 'Sigmoid', + 'Hardsigmoid', 'Softmax', 'Softplus', 'Softshrink', 'Softsign', + 'Swish', 'Tanhshrink', + 'ThresholdedReLU', 'LogSigmoid', 'LogSoftmax', - 'HSigmoid', + 'Maxout', ] from ...fluid.dygraph import layers @@ -50,18 +54,18 @@ class ELU(layers.Layer): ELU Activation. .. 
math:: - + ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) Parameters: alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -106,11 +110,11 @@ class GELU(layers.Layer): approximate (bool, optional): Wether to enable approximation. Default is False. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -120,7 +124,7 @@ class GELU(layers.Layer): paddle.disable_static() x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]])) - + m = paddle.nn.GELU() out = m(x) # [-0.158655 0.345731 0.841345 1.39979] @@ -184,6 +188,52 @@ def forward(self, x): return F.hardshrink(x, self._threshold, self._name) +class Hardswish(layers.Layer): + """ + Hardswish activation + + Hardswish is proposed in MobileNetV3, and performs better in computational stability + and efficiency compared to swish function. For more details please refer + to: https://arxiv.org/pdf/1905.02244.pdf + + .. math:: + + Hardswish(x)= + \\left\\{ + \\begin{aligned} + &0, & & \\text{if } x \\leq -3 \\\\ + &x, & & \\text{if } x \\geq 3 \\\\ + &\\frac{x(x+3)}{6}, & & \\text{otherwise} + \\end{aligned} + \\right. + + Parameters: + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-4., 5., 1.]) + m = paddle.nn.Hardswish() + out = m(x) # [0., 5., 0.666667] + """ + + def __init__(self, name=None): + super(Hardswish, self).__init__() + self._name = name + + def forward(self, x): + return F.hardswish(x, self._name) + + class Tanh(layers.Layer): """ Tanh Activation. @@ -240,11 +290,11 @@ class Hardtanh(layers.Layer): max (float, optional): The value of max for Hardtanh. Default is 1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -268,142 +318,6 @@ def forward(self, x): return F.hardtanh(x, self._min, self._max, self._name) -class HSigmoid(layers.Layer): - """ - :alias_main: paddle.nn.HSigmoid - :alias: paddle.nn.HSigmoid,paddle.nn.layer.HSigmoid,paddle.nn.layer.activation.HSigmoid - - Hierarchical Sigmoid Layer. - - The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity - and speed up the model training, especially the training of language model. - Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. - For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on - the path, and sum them to get a total cost. - Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` - represents the number of classes or the size of word dict. 
- - The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural - Network Language Model _`. For the custom - tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): - - 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. - 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. - 3. Creating a dict map word_id -> code of path that from the word to the root node, we call it path_code. - Code means the label of each binary classifier, 1 indicate true, 0 indicate false. - 4. Now, each word should has its path and code along the path, you can pass a batch of path and code related - to the same batch of inputs. - - Parameters: - feature_size (int): The feature size. - num_classes (int): The number of classes or the size of word dict, must be greater than 2. - If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes` - should not be None. If the custom tree is used (:attr:`is_custom` is set to True), - :attr:`num_classes` should be the number of non-leaf nodes, which indicates the num of - classes using by the binary classifier. - param_attr (ParamAttr, optional): The parameter attribute for the learnable parameters/weights - of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a - ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is - initialized with Xavier. Default: None. - bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it - is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, - hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not - set, the bias is initialized zero. Default: None. - is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and - `path_code` should be passed to its forward method, otherwise `path_table` and `path_code` - should not be passed to its forward method. Default: False. - is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, the - gradient of W and input will be sparse. Default: False. - - Returns: - None - - Examples: - .. 
code-block:: python - - from paddle import fluid, nn - import paddle.fluid.dygraph as dg - import paddle.nn.functional as F - import numpy as np - - main = fluid.Program() - start = fluid.Program() - feature_size = 6 - num_classes = 8 - with fluid.unique_name.guard(): - with fluid.program_guard(main, start): - x = fluid.data("input", [-1, feature_size], - dtype="float32") - label = fluid.data("labels", [-1, 1], dtype="int64") - hsm = nn.HSigmoid(feature_size, num_classes) - y = hsm(x, label) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(start) - feed_dict = { - "input": np.random.randn(4, feature_size).astype(np.float32), - "labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64), - } - y_np, = exe.run(main, feed=feed_dict, fetch_list=[y]) - print(y_np.shape) - - # (4, 1) - """ - - def __init__(self, - feature_size, - num_classes, - param_attr=None, - bias_attr=None, - is_custom=False, - is_sparse=False, - dtype="float32"): - super(HSigmoid, self).__init__() - if (num_classes < 2) and (not is_custom): - raise ValueError( - "num_classes must not be less than 2 with default tree") - - if (not is_custom) and (is_sparse): - print("Sparse mode should not be used without custom tree") - is_sparse = False - - self._feature_size = feature_size - self._num_classes = num_classes - self._is_custom = is_custom - self._is_sparse = is_sparse - - self._param_attr = param_attr - self._bias_attr = bias_attr - - self._dtype = dtype - - remote_prefetch = is_sparse - print("With sparse mode, if your models has only" - " small parameter prefetch may cause speed down") - - C = self._num_classes if is_custom else self._num_classes - 1 - self.weight = self.create_parameter( - [C, self._feature_size], - attr=self._param_attr, - is_bias=False, - dtype=self._dtype) - self.bias = self.create_parameter( - [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype) - - def forward(self, input, label, path_table=None, path_code=None): - out = F.hsigmoid( - input, - label, - self.weight, - self.bias, - self._num_classes, - path_table=path_table, - path_code=path_code, - is_sparse=self._is_sparse) - return out - - class PReLU(layers.Layer): """ PReLU Activation. @@ -414,19 +328,19 @@ class PReLU(layers.Layer): Parameters: num_parameters (int, optional): Number of `weight` to learn. The supported values are: - 1 - a single parameter `alpha` is used for all input channels; + 1 - a single parameter `alpha` is used for all input channels; Number of channels - a seperate `alpha` is used for each input channel. Default is 1. init (float, optional): Init value of learnable `weight`. Default is 0.25. - weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. + weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. Default is None. For more information, please refer to :ref:`api_fluid_ParamAttr`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. Default dtype is float32. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -487,7 +401,7 @@ class ReLU(layers.Layer): Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. 
code-block:: python @@ -531,8 +445,6 @@ class ReLU6(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-1, 0.3, 6.5])) m = paddle.nn.ReLU6() out = m(x) # [0, 0.3, 6] @@ -574,8 +486,6 @@ class SELU(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([[0.0, 1.0],[2.0, 3.0]])) m = paddle.nn.SELU() out = m(x) # [[0, 1.050701],[2.101402, 3.152103]] @@ -613,11 +523,11 @@ class LeakyReLU(layers.Layer): :math:`x < 0` . Default is 0.01. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -643,11 +553,11 @@ def forward(self, x): class Sigmoid(layers.Layer): """ this interface is used to construct a callable object of the ``Sigmoid`` class. This layer calcluate the `sigmoid` of input x. - + .. math:: Sigmoid(x) = \frac{1}{1 + e^{-x}} - + Parameters: name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -656,7 +566,7 @@ class Sigmoid(layers.Layer): Returns: A callable object of Sigmoid. - + Examples: .. code-block:: python @@ -680,6 +590,53 @@ def forward(self, x): return F.sigmoid(x, self.name) +class Hardsigmoid(layers.Layer): + """ + This interface is used to construct a callable object of the ``Hardsigmoid`` class. + This layer calculates the `hardsigmoid` of input x. + + A 3-part piecewise linear approximation of sigmoid (https://arxiv.org/abs/1603.00391), + which is much faster than sigmoid. + + .. math:: + + Hardsigmoid(x)= + \\left\\{ + \\begin{aligned} + &0, & & \\text{if } x \\leq -3 \\\\ + &1, & & \\text{if } x \\geq 3 \\\\ + &x/6 + 1/2, & & \\text{otherwise} + \\end{aligned} + \\right. + + Parameters: + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Shape: + x: N-D tensor, available dtype is float32, float64. + + Returns: + A callable object of Hardsigmoid. + + Examples: + + .. code-block:: python + + import paddle + + m = paddle.nn.Hardsigmoid() + x = paddle.to_tensor([-4., 5., 1.]) + out = m(x) # [0., 1., 0.666667] + """ + + def __init__(self, name=None): + super(Hardsigmoid, self).__init__() + self.name = name + + def forward(self, x): + return F.hardsigmoid(x, self.name) + + class Softplus(layers.Layer): """ Softplus Activation @@ -705,8 +662,6 @@ class Softplus(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) m = paddle.nn.Softplus() out = m(x) # [0.513015, 0.598139, 0.744397, 0.854355] @@ -749,8 +704,6 @@ class Softshrink(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8])) m = paddle.nn.Softshrink() out = m(x) # [-0.4, 0, 0, 0.3] @@ -787,8 +740,6 @@ class Softsign(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) m = paddle.nn.Softsign() out = m(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] @@ -802,6 +753,41 @@ def forward(self, x): return F.softsign(x, self._name) +class Swish(layers.Layer): + """ + Swish Activation. + + .. math:: + + Swish(x) = \\frac{x}{1 + e^{-x}} + + Parameters: + name (str, optional): Name for the operation (optional, default is None).
+ For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x = paddle.to_tensor(np.array([-2., 0., 1.])) + m = paddle.nn.Swish() + out = m(x) # [-0.238406, 0., 0.731059] + """ + + def __init__(self, name=None): + super(Swish, self).__init__() + self._name = name + + def forward(self, x): + return F.swish(x, self._name) + + class Tanhshrink(layers.Layer): """ Tanhshrink Activation @@ -824,8 +810,6 @@ class Tanhshrink(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) m = paddle.nn.Tanhshrink() out = m(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] @@ -839,10 +823,50 @@ def forward(self, x): return F.tanhshrink(x, self._name) +class ThresholdedReLU(layers.Layer): + """ + Thresholded ReLU Activation + + .. math:: + + ThresholdedReLU(x) = \\begin{cases} + x, \\text{if } x > threshold \\\\ + 0, \\text{otherwise} + \\end{cases} + + Parameters: + threshold (float, optional): The value of threshold for ThresholdedReLU. Default is 1.0 + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x = paddle.to_tensor(np.array([2., 0., 1.])) + m = paddle.nn.ThresholdedReLU() + out = m(x) # [2., 0., 0.] + """ + + def __init__(self, threshold=1.0, name=None): + super(ThresholdedReLU, self).__init__() + self._threshold = threshold + self._name = name + + def forward(self, x): + return F.thresholded_relu(x, self._threshold, self._name) + + class LogSigmoid(layers.Layer): """ LogSigmoid Activation. - + .. math:: LogSigmoid(x) = log \\frac{1}{1 + e^{-x}} @@ -851,11 +875,11 @@ class LogSigmoid(layers.Layer): x (Tensor): The input Tensor with data type float32, or float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. - + Examples: .. code-block:: python @@ -961,7 +985,7 @@ class Softmax(layers.Layer): :math:`axis + D` . Default is -1. dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data type of the output tensor. If dtype is specified, ``x`` is casted - to ``dtype`` before the operation is performed. This is useful for + to ``dtype`` before the operation is performed. This is useful for preventing data type overflows. Supported dtype: float32, float64. If ``dtype`` is None, the output Tensor has the same dtype as x. Default is None. @@ -1013,7 +1037,7 @@ class LogSoftmax(layers.Layer): .. math:: - Out[i, j] = log(softmax(x)) + Out[i, j] = log(softmax(x)) = log(\\frac{\exp(X[i, j])}{\\sum_j(exp(X[i, j])}) Parameters: @@ -1023,7 +1047,7 @@ class LogSoftmax(layers.Layer): same way as :math:`axis + D` . Default is -1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shape: - input: Tensor with any shape. - output: Tensor with the same shape as input. 
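The Layer counterparts introduced above compose like any other ``paddle.nn`` layer. A minimal sketch, not part of the patch and with arbitrary layer sizes chosen only for illustration:

import paddle
import paddle.nn as nn

# A tiny block in the spirit of MobileNetV3, using the new activation layers.
block = nn.Sequential(
    nn.Linear(8, 16),
    nn.Hardswish(),
    nn.Linear(16, 4),
    nn.Hardsigmoid(),
)
y = block(paddle.rand([2, 8]))
print(y.shape)  # [2, 4]; Hardsigmoid keeps every output in [0, 1]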
@@ -1060,3 +1084,64 @@ def __init__(self, axis=-1, name=None): def forward(self, x): return F.log_softmax(x, self._axis) + + +class Maxout(layers.Layer): + """ + Maxout Activation. + + Assumed the input shape is (N, Ci, H, W). + The output shape is (N, Co, H, W). + Then Co = Ci/groups and the operator formula is as follows: + + .. math:: + + &out_{si+j} = \max_{k} x_{gsi + sk + j} \\\\ + &g = groups \\\\ + &s = \\frac{input.size}{num\\_channels} \\\\ + &0 \\le i < \\frac{num\\_channels}{groups} \\\\ + &0 \\le j < s \\\\ + &0 \\le k < groups + + Parameters: + groups (int, optional): The groups number of maxout. `groups` specifies the + index of channel dimension where maxout will be performed. This must be + a factor of number of features. Default is 1. + axis (int, optional): The axis along which to perform maxout calculations. + It should be 1 when data format is NCHW, be -1 or 3 when data format + is NHWC. If ``axis`` < 0, it works the same way as :math:`axis + D` , + where D is the dimensions of ``x`` . Default is 1. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: :math:`(N, C_{in}, H_{in}, W_{in})` + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + + Examples: + .. code-block:: python + + import paddle + + x = paddle.rand([1, 2, 3, 4]) + # [[[[0.5002636 0.22272532 0.17402348 0.2874594 ] + # [0.95313174 0.6228939 0.7129065 0.7087491 ] + # [0.02879342 0.88725346 0.61093384 0.38833922]] + # [[0.5231306 0.03807496 0.91661984 0.15602879] + # [0.666127 0.616567 0.30741522 0.24044901] + # [0.7142536 0.7351477 0.31588817 0.23782359]]]] + m = paddle.nn.Maxout(groups=2) + out = m(x) + # [[[[0.5231306 0.22272532 0.91661984 0.2874594 ] + # [0.95313174 0.6228939 0.7129065 0.7087491 ] + # [0.7142536 0.88725346 0.61093384 0.38833922]]]] + """ + + def __init__(self, groups, axis=1, name=None): + super(Maxout, self).__init__() + self._groups = groups + self._axis = axis + self._name = name + + def forward(self, x): + return F.maxout(x, self._groups, self._axis, self._name) diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 98048bb7e64cf..5ce4baca55749 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -23,6 +23,7 @@ __all__ = [ 'BCEWithLogitsLoss', 'CrossEntropyLoss', + 'HSigmoidLoss', 'MSELoss', 'L1Loss', 'NLLLoss', @@ -251,6 +252,128 @@ def forward(self, input, label): reduction=self.reduction) +class HSigmoidLoss(fluid.dygraph.Layer): + """ + Hierarchical Sigmoid Layer. + + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity + and speed up the model training, especially the training of language model. + Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. + For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on + the path, and sum them to get a total cost. + Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` + represents the number of classes or the size of word dict. + + The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural + Network Language Model _`. For the custom + tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): + + 1. 
Using a custom word dict to build a binary tree; each leaf node should be a word in the word dict. + 2. Creating a dict that maps word_id -> path from the word to the root node; we call it path_table. + 3. Creating a dict that maps word_id -> code of the path from the word to the root node; we call it path_code. + Code means the label of each binary classifier: 1 indicates true, 0 indicates false. + 4. Now each word has its path and the code along the path, and you can pass a batch of paths and codes related + to the same batch of inputs. + + Parameters: + feature_size (int): The number of features. + num_classes (int): The number of classes or the size of the word dict, must be greater than 2. + If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes` + should not be None. If the custom tree is used (:attr:`is_custom` is set to True), + :attr:`num_classes` should be the number of non-leaf nodes, which indicates the number of + classes used by the binary classifiers. + weight_attr (ParamAttr, optional): The parameter attribute for the learnable weights + of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a + ParamAttr as weight_attr. If the Initializer of the weight_attr is not set, the parameter is + initialized with Xavier. Default is None. + bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it + is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, + hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not + set, the bias is initialized to zero. Default is None. + is_custom (bool, optional): Whether to use a custom binary tree. If True, `path_table` and + `path_code` should be passed to its forward method; otherwise `path_table` and `path_code` + should not be passed to its forward method. Default is False. + is_sparse (bool, optional): Whether to use sparse updating instead of dense updating. If True, + the gradients of weight and input will be sparse. Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + input (Tensor): The input tensor. Its shape is [N, D], where N is the batch size and D is the feature size. Its data type should be float32 or float64. + label (Tensor): Its shape is [N, 1]. Its data type should be int64. + output (Tensor): The HSigmoid Loss of ``input`` and ``label``. Its shape is [N, 1]. + + Examples: + .. 
code-block:: python + + import paddle + paddle.set_device('cpu') + + input = paddle.uniform([2, 3]) + # [[-0.2820413 0.9528898 -0.81638825] # random + # [-0.6733154 -0.33866507 0.25770962]] # random + label = paddle.to_tensor([0, 1]) + m = paddle.nn.HSigmoidLoss(3, 5) + out = m(input, label) + # [[2.4543471] # random + # [1.9359267]] # random + """ + + def __init__(self, + feature_size, + num_classes, + weight_attr=None, + bias_attr=None, + is_custom=False, + is_sparse=False, + name=None): + super(HSigmoidLoss, self).__init__() + if (num_classes < 2) and (not is_custom): + raise ValueError( + "num_classes must not be less than 2 with default tree") + + if (not is_custom) and (is_sparse): + print("Sparse mode should not be used without a custom tree") + is_sparse = False + + self._feature_size = feature_size + self._num_classes = num_classes + self._is_custom = is_custom + self._is_sparse = is_sparse + + self._weight_attr = weight_attr + self._bias_attr = bias_attr + + self._name = name + self._dtype = paddle.get_default_dtype() + + remote_prefetch = is_sparse + print("With sparse mode, if your model has only" + " small parameters, prefetch may cause a slowdown") + + C = self._num_classes if is_custom else self._num_classes - 1 + self.weight = self.create_parameter( + [C, self._feature_size], + attr=self._weight_attr, + is_bias=False, + dtype=self._dtype) + self.bias = self.create_parameter( + [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype) + + def forward(self, input, label, path_table=None, path_code=None): + out = F.hsigmoid_loss( + input, + label, + self._num_classes, + self.weight, + self.bias, + path_table=path_table, + path_code=path_code, + is_sparse=self._is_sparse, + name=self._name) + return out + + class MSELoss(fluid.dygraph.layers.Layer): """ **Mean Square Error Loss** diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index 31a99f6282718..cd089432b1ca3 100644 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -27,7 +27,6 @@ 'data_norm', 'deformable_conv', 'group_norm', - 'hsigmoid', 'instance_norm', 'layer_norm', 'multi_box_head', @@ -39,7 +38,8 @@ 'switch_case', ] -from ...fluid.layers import fc #DEFINE_ALIAS +from .common import fc #DEFINE_ALIAS + from ...fluid.layers import batch_norm #DEFINE_ALIAS from ...fluid.layers import bilinear_tensor_product #DEFINE_ALIAS from ...fluid.layers import case #DEFINE_ALIAS @@ -52,7 +52,6 @@ from ...fluid.layers import data_norm #DEFINE_ALIAS from ...fluid.layers import deformable_conv #DEFINE_ALIAS from ...fluid.layers import group_norm #DEFINE_ALIAS -from ...fluid.layers import hsigmoid #DEFINE_ALIAS from ...fluid.layers import instance_norm #DEFINE_ALIAS from ...fluid.layers import layer_norm #DEFINE_ALIAS from ...fluid.layers import multi_box_head #DEFINE_ALIAS diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py new file mode 100644 index 0000000000000..59ffacdaebed5 --- /dev/null +++ b/python/paddle/static/nn/common.py @@ -0,0 +1,165 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.fluid.framework import static_only + +__all__ = ['fc'] + + +@static_only +def fc(x, + size, + num_flatten_dims=1, + weight_attr=None, + bias_attr=None, + activation=None, + name=None): + """ + + Fully-Connected layer can take a tensor or a list of tensor as its inputs. + It creates a 2-D weight tensor for each input tensor, which represents its + weight matrix from each input unit to each output unit. The fully connected + layer multiplies each input tensor with its corresponding weight to produce + an output tensor with shape :math:`[batch\_size, *, size]` , where :math:`*` + means any number of additional dimensions. If a list of tensor is given, + the results of multiple output tensors with shape :math:`[batch\_size, *, size]` + will be summed up. If :attr:`bias_attr` is not False, a 1-D bias tensor will + be created and added to the output. Finally, if :attr:`activation` is not None, + it will be applied to the output as well. + + For a single input tensor :math:`X` , the equation is: + + .. math:: + + Out = Act({XW + b}) + + For a list of input tensor, the equation is: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + + where: + + * :math:`N`: The number of the input tensors. :math:`N` equals to :math:`len(X)` if :math:`X` is list of tensor. + * :math:`X_i`: The i-th input tensor. + * :math:`W_i`: The i-th weight matrix corresponding i-th input tensor. + * :math:`b`: The bias created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + + .. code-block:: text + + # Case 1, input is a single tensor: + x.data = [[[0.1, 0.2], + [0.3, 0.4]]] + x.shape = (1, 2, 2) # 1 is batch_size + + out = paddle.static.nn.fc(x=x, size=1, num_flatten_dims=2) + + # Get the output: + out.data = [[0.83234344], [0.34936576]] + out.shape = (1, 2, 1) + + # Case 2, input is a list of tensor: + x0.data = [[[0.1, 0.2], + [0.3, 0.4]]] + x0.shape = (1, 2, 2) # 1 is batch_size + + x1.data = [[[0.1, 0.2, 0.3]]] + x1.shape = (1, 1, 3) + + out = paddle.static.nn.fc(x=[x0, x1], size=2) + + # Get the output: + out.data = [[0.18669507, 0.1893476]] + out.shape = (1, 2) + + Args: + x (Tensor|list of Tensor): A tensor or a list of tensor. The number of dimensions + of each tensor is at least 2. The data type should be float16, float32 or float64. + size (int): The number of output units in this layer, which also means the feature + size of output tensor. + num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than + two dimensions. If this happens, the multi-dimensional tensor will first be flattened + into a 2-D matrix. The parameter :attr:`num_flatten_dims` determines how the input + tensor is flattened: the first :math:`num\_flatten\_dims` (inclusive, index starts from 1) + dimensions will be flatten to form the first dimension of the final matrix (height of + the matrix), and the rest :math:`rank(x) - num\_flatten\_dims` dimensions are + flattened to form the second dimension of the final matrix (width of the matrix). 
+ For example, assuming that :attr:`x` is a 5-dimensional tensor with a shape + :math:`[2, 3, 4, 5, 6]` , and :attr:`num_flatten_dims` = 3. + Then, the flattened matrix will have a shape :math:`[2 * 3 * 4, 5 * 6] = [24, 30]` . + Default: 1. + weight_attr (ParamAttr, optional): The attribute for the learnable weight. + The default value is None, and the weight will be initialized to zero. + For detailed information, please refer to :attr:`paddle.ParamAttr`. + bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias. + If it is set to False, no bias will be added to the output. + If it is set to None or one kind of ParamAttr, a bias parameter will + be created according to ParamAttr. For detailed information, please refer + to :attr:`paddle.ParamAttr`. The default value is None and the bias will be + initialized to zero. + activation (str, optional): Activation to be applied to the output of + this layer, such as tanh, softmax, sigmoid, relu. For more information, + please refer to :ref:`api_guide_activations_en` . Default: None. + name (str, optional): The default value is None. Normally there is no need for user to set + it. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Tensor, its shape is :math:`[batch\_size, *, size]` , and the data type is same with input. + + Raises: + ValueError: If dimensions of the input tensor is less than 2. + + Examples: + .. code-block:: python + + import paddle + paddle.enable_static() + + # When input is a single tensor + x = paddle.static.data(name="x", shape=[1, 2, 2], dtype="float32") + # x: [[[0.1 0.2] + # [0.3 0.4]]] + out = paddle.static.nn.fc( + x=x, + size=1, + num_flatten_dims=2, + weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=0.5)), + bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0))) + # out: [[[1.15] + # [1.35]]] + + # When input is multiple tensors + x0 = paddle.static.data(name="x0", shape=[1, 2, 2], dtype="float32") + # x0: [[[0.1 0.2] + # [0.3 0.4]]] + x1 = paddle.static.data(name="x1", shape=[1, 1, 3], dtype="float32") + # x1: [[[0.1 0.2 0.3]]] + out = paddle.static.nn.fc( + x=[x0, x1], + size=2, + weight_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=0.5)), + bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0))) + # out: [[1.8 1.8]] + """ + return paddle.fluid.layers.fc(input=x, + size=size, + num_flatten_dims=num_flatten_dims, + param_attr=weight_attr, + bias_attr=bias_attr, + act=activation, + name=name) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 2dcdf1603a737..2745464995f5d 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -687,27 +687,24 @@ def t(input, name=None): def cross(x, y, axis=None, name=None): """ - :alias_main: paddle.cross - :alias: paddle.cross,paddle.tensor.cross,paddle.tensor.linalg.cross - Computes the cross product between two tensors along an axis. + Inputs must have the same shape, and the length of their axes should be equal to 3. If `axis` is not given, it defaults to the first axis found with the length 3. Args: - x (Variable): The first input tensor variable. - y (Variable): The second input tensor variable. + x (Tensor): The first input tensor. + y (Tensor): The second input tensor. axis (int, optional): The axis along which to compute the cross product. It defaults to the first axis found with the length 3. - name (str, optional): The default value is None. 
Normally there is no need for - user to set this property. For more information, please refer to :ref:`api_guide_Name` + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: A Tensor with same data type as `x`. + Tensor. A Tensor with the same data type as `x`. Examples: .. code-block:: python + import paddle - paddle.disable_static() x = paddle.to_tensor([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0], @@ -715,14 +712,13 @@ def cross(x, y, axis=None, name=None): y = paddle.to_tensor([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) + z1 = paddle.cross(x, y) - print(z1.numpy()) # [[-1. -1. -1.] # [ 2. 2. 2.] # [-1. -1. -1.]] z2 = paddle.cross(x, y, axis=1) - print(z2.numpy()) # [[0. 0. 0.] # [0. 0. 0.] # [0. 0. 0.]] diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 34d15ed0bab61..2a370422eed7e 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1622,39 +1622,37 @@ def kron(x, y, name=None): def cumsum(x, axis=None, dtype=None, name=None): """ - The cumulative sum of the elements along a given axis. The first element of the result is the same of the first element of the input. + The cumulative sum of the elements along a given axis. + + **Note**: + The first element of the result is the same as the first element of the input. Args: - x (Tensor): Input of cumsum operator, the Tensor needed to be cumsumed. + x (Tensor): The input tensor needed to be cumsumed. axis (int, optional): The dimension to accumulate along. -1 means the last dimension. The default (None) is to compute the cumsum over the flattened array. dtype (str, optional): The data type of the output tensor, can be float32, float64, int32, int64. If specified, the input tensor is casted to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor, the result of cumsum operator, output of cumsum operator. + Tensor, the result of cumsum operator. Examples: .. 
code-block:: python import paddle - import numpy as np - - paddle.disable_static() - data_np = np.arange(12).reshape(3, 4) - data = paddle.to_tensor(data_np) + + data = paddle.arange(12) + data = paddle.reshape(data, (3, 4)) y = paddle.cumsum(data) - print(y.numpy()) # [ 0 1 3 6 10 15 21 28 36 45 55 66] y = paddle.cumsum(data, axis=0) - print(y.numpy()) # [[ 0 1 2 3] # [ 4 6 8 10] # [12 15 18 21]] y = paddle.cumsum(data, axis=-1) - print(y.numpy()) # [[ 0 1 3 6] # [ 4 9 15 22] # [ 8 17 27 38]] diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 96c4483a35ba8..56105b6d7f15a 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -501,6 +501,11 @@ def test_summary_nlp(self): rnn = paddle.nn.LSTM(16, 32, 2) paddle.summary(rnn, [(-1, 23, 16), ((2, None, 32), (2, -1, 32))]) + def test_summary_dtype(self): + input_shape = (3, 1) + net = paddle.nn.Embedding(10, 3, sparse=True) + paddle.summary(net, input_shape, dtypes='int64') + def test_summary_error(self): with self.assertRaises(TypeError): nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) @@ -551,9 +556,10 @@ def test_export_deploy_model(self): shutil.rmtree(save_dir) paddle.enable_static() - def test_dygraph_export_deploy_model_without_inputs(self): + def test_dygraph_export_deploy_model_about_inputs(self): mnist_data = MnistDataset(mode='train') paddle.disable_static() + # without inputs for initial in ["fit", "train_batch", "eval_batch", "test_batch"]: save_dir = tempfile.mkdtemp() if not os.path.exists(save_dir): @@ -579,6 +585,18 @@ def test_dygraph_export_deploy_model_without_inputs(self): model.save(save_dir, training=False) shutil.rmtree(save_dir) + # with inputs, and the type of inputs is InputSpec + save_dir = tempfile.mkdtemp() + if not os.path.exists(save_dir): + os.makedirs(save_dir) + net = LeNet() + inputs = InputSpec([None, 1, 28, 28], 'float32', 'x') + model = Model(net, inputs) + optim = fluid.optimizer.Adam( + learning_rate=0.001, parameter_list=model.parameters()) + model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) + model.save(save_dir, training=False) + shutil.rmtree(save_dir) class TestRaiseError(unittest.TestCase): diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 389d45fc6b95e..b61ba138441c9 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -2,4 +2,5 @@ PyGithub coverage pycrypto ; platform_system != "Windows" mock +opencv-python<=4.2.0.32 visualdl ; python_version>="3.5" diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index 9fe58885fa553..9b5602d4943ad 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -156,19 +156,14 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 --no-cache-dir install opencv-python==4.2.0.32 && \ pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 --no-cache-dir install opencv-python==4.2.0.32 && \ pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 --no-cache-dir install opencv-python==4.2.0.32 && \ pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 
'ipython==5.3.0' && \ pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.8 --no-cache-dir install opencv-python==4.2.0.32 && \ pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip --no-cache-dir install opencv-python==4.2.0.32 + pip --no-cache-dir install 'ipykernel==4.6.0' #For docstring checker RUN pip3 --no-cache-dir install pylint pytest astroid isort && \ diff --git a/tools/dockerfile/ubuntu16_dev.sh b/tools/dockerfile/ubuntu16_dev.sh index e7827b6598eeb..212e9acfea541 100755 --- a/tools/dockerfile/ubuntu16_dev.sh +++ b/tools/dockerfile/ubuntu16_dev.sh @@ -28,11 +28,13 @@ function ref_whl(){ ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl else ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl fi if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then @@ -40,11 +42,13 @@ function ref_whl(){ ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl else ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl fi } @@ -55,6 +59,7 @@ function install_whl(){ sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle3_whl} && pip3.5 install ${ref_paddle3_whl} && rm -f ${ref_paddle3_whl}" Dockerfile.tmp sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle36_whl} && pip3.6 install ${ref_paddle36_whl} && rm -f ${ref_paddle36_whl}" Dockerfile.tmp sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle37_whl} && pip3.7 install ${ref_paddle37_whl} && rm -f ${ref_paddle37_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle38_whl} && pip3.8 install ${ref_paddle38_whl} && rm -f ${ref_paddle38_whl}" Dockerfile.tmp } function install_gcc(){ diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh index a1881f551da1c..81eb19dc0661e 100755 --- a/tools/get_cpu_info.sh +++ b/tools/get_cpu_info.sh @@ -36,7 +36,7 @@ if [ $numa_nodes -lt $sockets ]; then fi echo "********** Software Information **********" -echo "OS Version : `cat 
/proc/version`" +echo "OS Version : `uname -o`" echo "Kernel Release Version : `uname -r`" echo "Kernel Patch Version : `uname -v`" echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 index c27fdcea2401c..55c30579fb91e 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 @@ -174,16 +174,12 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ RUN pip3 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3 --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0 && \ - pip3 --no-cache-dir install opencv-python && \ pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3.6 --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0 && \ - pip3.6 --no-cache-dir install opencv-python && \ pip3.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3.7 --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0 && \ - pip3.7 --no-cache-dir install opencv-python && \ pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ - pip --no-cache-dir install ipykernel==4.6.0 jupyter==1.0.0 && \ - pip --no-cache-dir install opencv-python + pip --no-cache-dir install ipykernel==4.6.0 #For docstring checker RUN pip3 --no-cache-dir install pylint pytest astroid isort
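Taken together, the two scenarios added to python/paddle/tests/test_model.py earlier in this patch (summarizing a network with integer inputs, and exporting a Model built from an InputSpec) amount to the following standalone sketch. It only mirrors the test code: it assumes the high-level paddle.Model API used by the test suite, uses paddle.vision.models.LeNet in place of the suite's local LeNet helper, and save_dir is just an illustrative temporary location.

.. code-block:: python

    import shutil
    import tempfile

    import paddle
    import paddle.fluid as fluid
    from paddle.static import InputSpec

    paddle.disable_static()  # the tests run in dygraph mode

    # paddle.summary on an integer-typed input: an Embedding expects int64 ids,
    # so the dtype is passed explicitly (mirrors test_summary_dtype).
    net = paddle.nn.Embedding(10, 3, sparse=True)
    paddle.summary(net, (3, 1), dtypes='int64')

    # Saving an inference model when the Model was constructed from an InputSpec
    # (mirrors the new branch of test_dygraph_export_deploy_model_about_inputs).
    inputs = InputSpec([None, 1, 28, 28], 'float32', 'x')
    model = paddle.Model(paddle.vision.models.LeNet(), inputs)
    optim = fluid.optimizer.Adam(
        learning_rate=0.001, parameter_list=model.parameters())
    model.prepare(optimizer=optim,
                  loss=paddle.nn.CrossEntropyLoss(reduction="sum"))

    save_dir = tempfile.mkdtemp()
    model.save(save_dir, training=False)  # exports the inference program and params
    shutil.rmtree(save_dir)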